diff --git a/docs/api/index.html b/docs/api/index.html index 38be9290..8802c3d3 100644 --- a/docs/api/index.html +++ b/docs/api/index.html @@ -149,6 +149,7 @@

Classification

subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -246,7 +247,7 @@

Classification

Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) CPUs are used; thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
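+A minimal sketch of the negative-value behaviour (assuming an 8-core machine and that X_train/y_train are already defined; the other parameter values are illustrative):
+
+from tpot import TPOTClassifier
+
+# n_jobs=-2 leaves one CPU free: n_cpus + 1 + n_jobs = 8 + 1 - 2 = 7 cores used
+pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, n_jobs=-2, verbosity=2)
+pipeline_optimizer.fit(X_train, y_train)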
max_time_mins: integer or None, optional (default=None) @@ -285,6 +286,15 @@

Classification

See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. +template: string (default="RandomTree") +
+Template of a predefined pipeline structure. This option specifies a desired structure for the machine learning pipelines evaluated by TPOT. +

+So far this option only supports linear pipeline structures. Each step in the pipeline should be a main class of operators (Selector, Transformer or Classifier) or a specific operator (e.g. `SelectPercentile`) defined in the TPOT operator configuration. If a step is a main class, TPOT will randomly assign one of its subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default, the value of template is "RandomTree", and TPOT generates tree-based pipelines randomly. + +See the Template option in TPOT section for more details. +
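+For example, a minimal sketch that restricts TPOT to selector-transformer-classifier pipelines (the other parameter values are illustrative):
+
+from tpot import TPOTClassifier
+
+# Every evaluated pipeline is: feature selector -> feature transformer -> classifier
+tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,
+                      template='Selector-Transformer-Classifier')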
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -611,6 +621,7 @@

Regression

subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -709,7 +720,7 @@

Regression

Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

-Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) CPUs are used; thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
max_time_mins: integer or None, optional (default=None) @@ -748,6 +759,15 @@

Regression

See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
+template: string (default="RandomTree") +
+Template of a predefined pipeline structure. This option specifies a desired structure for the machine learning pipelines evaluated by TPOT. +

+So far this option only supports linear pipeline structures. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in the TPOT operator configuration. If a step is a main class, TPOT will randomly assign one of its subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Regressor". By default, the value of template is "RandomTree", and TPOT generates tree-based pipelines randomly. + +See the Template option in TPOT section for more details. +
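+For example, a minimal sketch that pins the first step to a specific operator (the other parameter values are illustrative):
+
+from tpot import TPOTRegressor
+
+# First step is fixed to SelectPercentile; the last step must be a Regressor for TPOTRegressor
+tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2,
+                     template='SelectPercentile-Transformer-Regressor')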
+ warm_start: boolean, optional (default=False)
Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). diff --git a/docs/index.html b/docs/index.html index 014d4776..fae2e13c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -213,5 +213,5 @@ diff --git a/docs/search/search_index.json b/docs/search/search_index.json index 31701985..c3b0281f 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -12,7 +12,7 @@ }, { "location": "/using/", - "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. 
When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. 
To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. 
Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or \nsklearn.external.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you don't need it anymore.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn. But it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. You can enable the \nforkserver\n mode globally for your program by putting the following codes into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. 
\nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on the your Dask cluster regardless of whether \nn_jobs\n is specified.\n\n\nestimator = TPOTEstimator(use_dask=True)\n\n\n\n\nThis will use use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", + "text": "What to expect from AutoML software\n\n\nAutomated machine learning (AutoML) takes a higher-level approach to machine learning than most practitioners are used to,\nso we've gathered a handful of guidelines on what to expect when running AutoML software such as TPOT.\n\n\nAutoML algorithms aren't intended to run for only a few minutes\n\n\n\nOf course, you \ncan\n run TPOT for only a few minutes and it will find a reasonably good pipeline for your dataset.\nHowever, if you don't run TPOT for long enough, it may not find the best possible pipeline for your dataset. It may even not\nfind any suitable pipeline at all, in which case a \nRuntimeError('A pipeline has not yet been optimized. Please call fit() first.')\n\nwill be raised.\nOften it is worthwhile to run multiple instances of TPOT in parallel for a long time (hours to days) to allow TPOT to thoroughly search\nthe pipeline space for your dataset.\n\n\nAutoML algorithms can take a long time to finish their search\n\n\n\nAutoML algorithms aren't as simple as fitting one model on the dataset; they are considering multiple machine learning algorithms\n(random forests, linear models, SVMs, etc.) in a pipeline with multiple preprocessing steps (missing value imputation, scaling,\nPCA, feature selection, etc.), the hyperparameters for all of the models and preprocessing steps, as well as multiple ways\nto ensemble or stack the algorithms within the pipeline.\n\n\nAs such, TPOT will take a while to run on larger datasets, but it's important to realize why. With the default TPOT settings\n(100 generations with 100 population size), TPOT will evaluate 10,000 pipeline configurations before finishing.\nTo put this number into context, think about a grid search of 10,000 hyperparameter combinations for a machine learning algorithm\nand how long that grid search will take. That is 10,000 model configurations to evaluate with 10-fold cross-validation,\nwhich means that roughly 100,000 models are fit and evaluated on the training data in one grid search.\nThat's a time-consuming procedure, even for simpler models like decision trees.\n\n\nTypical TPOT runs will take hours to days to finish (unless it's a small dataset), but you can always interrupt\nthe run partway through and see the best results so far. 
TPOT also \nprovides\n a \nwarm_start\n parameter that\nlets you restart a TPOT run from where it left off.\n\n\nAutoML algorithms can recommend different solutions for the same dataset\n\n\n\nIf you're working with a reasonably complex dataset or run TPOT for a short amount of time, different TPOT runs\nmay result in different pipeline recommendations. TPOT's optimization algorithm is stochastic in nature, which means\nthat it uses randomness (in part) to search the possible pipeline space. When two TPOT runs recommend different\npipelines, this means that the TPOT runs didn't converge due to lack of time \nor\n that multiple pipelines\nperform more-or-less the same on your dataset.\n\n\nThis is actually an advantage over fixed grid search techniques: TPOT is meant to be an assistant that gives\nyou ideas on how to solve a particular machine learning problem by exploring pipeline configurations that you\nmight have never considered, then leaves the fine-tuning to more constrained parameter tuning techniques such\nas grid search.\n\n\nTPOT with code\n\n\nWe've taken care to design the TPOT interface to be as similar as possible to scikit-learn.\n\n\nTPOT can be imported just like any regular Python module. To import TPOT, type:\n\n\nfrom tpot import TPOTClassifier\n\n\n\n\nthen create an instance of TPOT as follows:\n\n\npipeline_optimizer = TPOTClassifier()\n\n\n\n\nIt's also possible to use TPOT for regression problems with the \nTPOTRegressor\n class. Other than the class name,\na \nTPOTRegressor\n is used the same way as a \nTPOTClassifier\n. You can read more about the \nTPOTClassifier\n and \nTPOTRegressor\n classes in the \nAPI documentation\n.\n\n\nSome example code with custom TPOT parameters might look like:\n\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\n\n\n\n\nNow TPOT is ready to optimize a pipeline for you. 
You can tell TPOT to optimize a pipeline based on a data set with the \nfit\n function:\n\n\npipeline_optimizer.fit(X_train, y_train)\n\n\n\n\nThe \nfit\n function initializes the genetic programming algorithm to find the highest-scoring pipeline based on average k-fold cross-validation\nThen, the pipeline is trained on the entire set of provided samples, and the TPOT instance can be used as a fitted model.\n\n\nYou can then proceed to evaluate the final pipeline on the testing set with the \nscore\n function:\n\n\nprint(pipeline_optimizer.score(X_test, y_test))\n\n\n\n\nFinally, you can tell TPOT to export the corresponding Python code for the optimized pipeline to a text file with the \nexport\n function:\n\n\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nOnce this code finishes running, \ntpot_exported_pipeline.py\n will contain the Python code for the optimized pipeline.\n\n\nBelow is a full example script using TPOT to optimize a pipeline, score it, and export the best pipeline to a file.\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\npipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,\n random_state=42, verbosity=2)\npipeline_optimizer.fit(X_train, y_train)\nprint(pipeline_optimizer.score(X_test, y_test))\npipeline_optimizer.export('tpot_exported_pipeline.py')\n\n\n\n\nCheck our \nexamples\n to see TPOT applied to some specific data sets.\n\n\nTPOT on the command line\n\n\nTo use TPOT via the command line, enter the following command with a path to the data file:\n\n\ntpot /path_to/data_file.csv\n\n\n\n\nAn example command-line call to TPOT may look like:\n\n\ntpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2\n\n\n\n\nTPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command:\n\n\ntpot --help\n\n\n\n\nDetailed descriptions of the command-line arguments are below.\n\n\n\n\n\n\nArgument\n\n\nParameter\n\n\nValid values\n\n\nEffect\n\n\n\n\n\n\n-is\n\n\nINPUT_SEPARATOR\n\n\nAny string\n\n\nCharacter used to separate columns in the input file.\n\n\n\n\n\n\n-target\n\n\nTARGET_NAME\n\n\nAny string\n\n\nName of the target column in the input file.\n\n\n\n\n\n\n-mode\n\n\nTPOT_MODE\n\n\n['classification', 'regression']\n\n\nWhether TPOT is being used for a supervised classification or regression problem.\n\n\n\n\n\n\n-o\n\n\nOUTPUT_FILE\n\n\nString path to a file\n\n\nFile to export the code for the final optimized pipeline.\n\n\n\n\n\n\n-g\n\n\nGENERATIONS\n\n\nAny positive integer\n\n\nNumber of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-p\n\n\nPOPULATION_SIZE\n\n\nAny positive integer\n\n\nNumber of individuals to retain in the GP population every generation. 
Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total.\n\n\n\n\n\n\n-os\n\n\nOFFSPRING_SIZE\n\n\nAny positive integer\n\n\nNumber of offspring to produce in each GP generation.\n\n\nBy default, OFFSPRING_SIZE = POPULATION_SIZE.\n\n\n\n\n\n\n-mr\n\n\nMUTATION_RATE\n\n\n[0.0, 1.0]\n\n\nGP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\n\n\n-xr\n\n\nCROSSOVER_RATE\n\n\n[0.0, 1.0]\n\n\nGP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\n\n\n-scoring\n\n\nSCORING_FN\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy',\n'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted',\n'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*'\n\n\nFunction used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression.\n\n\nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized.\n\n\nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\n\n-cv\n\n\nCV\n\n\nAny integer > 1\n\n\nNumber of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process.\n\n\n\n\n-sub\n\n\nSUBSAMPLE\n\n\n(0.0, 1.0]\n\n\nSubsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process.\n\n\n\n\n\n\n-njobs\n\n\nNUM_JOBS\n\n\nAny positive integer or -1\n\n\nNumber of CPUs for evaluating pipelines in parallel during the TPOT optimization process.\n\n\nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used.\n\n\n\n\n\n\n-maxtime\n\n\nMAX_TIME_MINS\n\n\nAny positive integer\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time.\n\n\n\n\n\n\n-maxeval\n\n\nMAX_EVAL_MINS\n\n\nAny positive float\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer.\n\n\n\n\n\n\n-s\n\n\nRANDOM_STATE\n\n\nAny positive integer\n\n\nRandom number generator seed for reproducibility.\n\n\nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future.\n\n\n\n\n\n\n-config\n\n\nCONFIG_FILE\n\n\nString or file path\n\n\nOperators and parameter configurations in TPOT:\n\n\n\n\n\nPath for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\n\n\n\n-template\n\n\nTEMPLATE\n\n\nString\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\n\n\n\n-memory\n\n\nMEMORY\n\n\nString or file path\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
Memory caching mode in TPOT:\n\n\n\n\n\nPath for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown.\n\n\nstring 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown.\n\n\n\n\n\n\n\n\n\n\n-cf\n\n\nCHECKPOINT_FOLDER\n\n\nFolder path\n\n\n\nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing.\n\n\nThis is useful in multiple cases:\n\n\n\nsudden death before tpot could save an optimized pipeline\n\n\nprogress tracking\n\n\ngrabbing a pipeline while tpot is working\n\n\n\n\n\nExample:\n\n\nmkdir my_checkpoints\n\n\n-cf ./my_checkpoints\n\n\n\n\n\n-es\n\n\nEARLY_STOP\n\n\nAny positive integer\n\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnd optimization process if there is no improvement in the set number of generations.\n\n\n\n\n\n-v\n\n\nVERBOSITY\n\n\n{0, 1, 2, 3}\n\n\nHow much information TPOT communicates while it is running.\n\n\n0 = none, 1 = minimal, 2 = high, 3 = all.\n\n\nA setting of 2 or higher will add a progress bar during the optimization procedure.\n\n\n\n\n\n\n--no-update-check\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\n\n\n\n\n--version\n\n\nShow TPOT's version number and exit.\n\n\n\n\n\n\n--help\n\n\nShow TPOT's help documentation and exit.\n\n\n\n\n\n\n\nScoring functions\n\n\nTPOT makes use of \nsklearn.model_selection.cross_val_score\n for evaluating pipelines, and as such offers the same support for scoring functions. There are two ways to make use of scoring functions with TPOT:\n\n\n\n\n\n\nYou can pass in a string to the \nscoring\n parameter from the list above. Any other strings will cause TPOT to throw an exception.\n\n\n\n\n\n\nYou can pass the callable object/function with signature \nscorer(estimator, X, y)\n, where \nestimator\n is trained estimator to use for scoring, \nX\n are features that will be passed to \nestimator.predict\n and \ny\n are target values for \nX\n. To do this, you should implement your own function. See the example below for further explanation.\n\n\n\n\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics.scorer import make_scorer\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n# Make a custom metric function\ndef my_custom_accuracy(y_true, y_pred):\n return float(sum(y_pred == y_true)) / len(y_true)\n\n# Make a custom a scorer from the custom metric function\n# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.\nmy_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n scoring=my_custom_scorer)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\n\n\n\nYou can pass a metric function with the signature \nscore_func(y_true, y_pred)\n (e.g. \nmy_custom_accuracy\n in the example above), where \ny_true\n are the true target values and \ny_pred\n are the predicted target values from an estimator. To do this, you should implement your own function. See the example above for further explanation. 
TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized (\ngreater_is_better=False\n in \nmake_scorer\n), whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\n\n\n\n\nmy_module.scorer_name\n: You can also use a custom \nscore_func(y_true, y_pred)\n or \nscorer(estimator, X, y)\n function through the command line by adding the argument \n-scoring my_module.scorer\n to your command-line call. TPOT will import your module and use the custom scoring function from there. TPOT will include your current working directory when importing the module, so you can place it in the same directory where you are going to run TPOT.\nExample: \n-scoring sklearn.metrics.auc\n will use the function auc from sklearn.metrics module.\n\n\n\n\n\n\nBuilt-in TPOT configurations\n\n\nTPOT comes with a handful of default operators and parameter configurations that we believe work well for optimizing machine learning pipelines. Below is a list of the current built-in configurations that come with TPOT.\n\n\n\n\n\n\nConfiguration Name\n\n\nDescription\n\n\nOperators\n\n\n\n\n\n\n\nDefault TPOT\n\n\nTPOT will search over a broad range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Some of these operators are complex and may take a long time to run, especially on larger datasets.\n\n\n\nNote: This is the default configuration for TPOT.\n To use this configuration, use the default value (None) for the config_dict parameter.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT light\n\n\nTPOT will search over a restricted range of preprocessors, feature constructors, feature selectors, models, and parameters to find a series of operators that minimize the error of the model predictions. Only simpler and fast-running operators will be used in these pipelines, so TPOT light is useful for finding quick and simple pipelines for a classification or regression problem.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT MDR\n\n\nTPOT will search over a series of feature selectors and \nMultifactor Dimensionality Reduction\n models to find a series of operators that maximize prediction accuracy. The TPOT MDR configuration is specialized for \ngenome-wide association studies (GWAS)\n, and is described in detail online \nhere\n.\n\n\nNote that TPOT MDR may be slow to run because the feature selection routines are computationally expensive, especially on large datasets.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\nTPOT sparse\n\n\nTPOT uses a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices.\n\n\nThis configuration works for both the TPOTClassifier and TPOTRegressor.\n\n\nClassification\n\n\n\n\nRegression\n\n\n\n\n\n\n\n\nTo use any of these configurations, simply pass the string name of the configuration to the \nconfig_dict\n parameter (or \n-config\n on the command line). 
For example, to use the \"TPOT light\" configuration:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict='TPOT light')\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\n\nCustomizing TPOT's operators and parameters\n\n\nBeyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters.\n\n\nThe custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., \nsklearn.naive_bayes.MultinomialNB\n) and the second level key is the corresponding parameter name for that operator (e.g., \nfit_prior\n). The second level key should point to a list of parameter values for that parameter, e.g., \n'fit_prior': [True, False]\n.\n\n\nFor a simple example, the configuration could be:\n\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\n\n\n\nin which case TPOT would only consider pipelines containing \nGaussianNB\n, \nBernoulliNB\n, \nMultinomialNB\n, and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the \nTPOTClassifier\n/\nTPOTRegressor\n \nconfig_dict\n parameter, described above. For example:\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nCommand-line users must create a separate \n.py\n file with the custom configuration and provide the path to the file to the \ntpot\n call. For example, if the simple example configuration above is saved in \ntpot_classifier_config.py\n, that configuration could be used on the command line with the command:\n\n\ntpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py\n\n\n\n\nWhen using the command-line interface, the configuration file specified in the \n-config\n parameter \nmust\n name its custom TPOT configuration \ntpot_config\n. 
Otherwise, TPOT will not be able to locate the configuration dictionary.\n\n\nFor more detailed examples of how to customize TPOT's operator configuration, see the default configurations for \nclassification\n and \nregression\n in TPOT's source code.\n\n\nNote that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.\n\n\nTemplate option in TPOT\n\n\nTemplate option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines.\n\n\nBelow is a simple example to use \ntemplate\n option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of \nSelectorMixin\n), 2nd step is a feature transformer (a subclass of \nTransformerMixin\n) and 3rd step is a classifier for classification (a subclass of \nClassifierMixin\n). The last step must be \nClassifier\n for \nTPOTClassifier\n's template but \nRegressor\n for \nTPOTRegressor\n. \nNote: although \nSelectorMixin\n is subclass of \nTransformerMixin\n in scikit-leawrn, but \nTransformer\n in this option excludes those subclasses of \nSelectorMixin\n.\n\n\ntpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n )\n\n\n\n\nIf a specific operator, e.g. \nSelectPercentile\n, is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.\n\n\nFeatureSetSelector in TPOT\n\n\nFeatureSetSelector\n is a special new operator in TPOT. This operator enables feature selection based on \npriori\n export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database (\nMSigDB\n) in the 1st step of pipeline via \ntemplate\n option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". 
Below is an example of how to use this operator in TPOT.\n\n\nPlease check our \npreprint paper\n for more details.\n\n\nfrom tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of indexes of the subsets in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)\n\n\n\n\nPipeline caching in TPOT\n\n\nWith the \nmemory\n parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or \nsklearn.externals.joblib.Memory\n in case they want to re-use the memory cache in future TPOT runs (or a \nwarm_start\n run).\n\n\nThere are three methods for enabling memory caching in TPOT:\n\n\nfrom tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir)\n\n\n\n\nNote: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. We recommend that you clean up the memory caches when you no longer need them.\n\n\nCrash/freeze issue with n_jobs > 1 under OSX or Linux\n\n\nInternally, TPOT uses \njoblib\n to fit estimators in parallel.\nThis is the same parallelization framework used by scikit-learn, but it may crash/freeze with n_jobs > 1 under OSX or Linux \nas scikit-learn does\n, especially with large datasets.\n\n\nOne solution is to configure Python's \nmultiprocessing\n module to use the \nforkserver\n start method (instead of the default \nfork\n) to manage the process pools. 
You can enable the \nforkserver\n mode globally for your program by putting the following code into your main script:\n\n\nimport multiprocessing\n\n# other imports, custom code, load data, define model...\n\nif __name__ == '__main__':\n multiprocessing.set_start_method('forkserver')\n\n # call scikit-learn utils or tpot utils with n_jobs > 1 here\n\n\n\n\nMore information about these start methods can be found in the \nmultiprocessing documentation\n.\n\n\nParallel Training with Dask\n\n\nFor large problems or when working in a Jupyter notebook, we highly recommend distributing the work on a \nDask\n cluster.\nThe \ndask-examples binder\n has a runnable example\nwith a small dask cluster.\n\n\nTo use your Dask cluster to fit a TPOT model, specify the \nuse_dask\n keyword when you create the TPOT estimator. \nNote: if \nuse_dask=True\n, TPOT will use as many cores as available on your Dask cluster. If \nn_jobs\n is specified, then it will control the chunk size (10*\nn_jobs\n if it is less than the offspring size) of parallel training. \n\n\nestimator = TPOTEstimator(use_dask=True, n_jobs=-1)\n\n\n\n\nThis will use all the workers on your cluster to do the training, and use \nDask-ML's pipeline rewriting\n to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the \ndistributed scheduler UI\n.\n\n\nAlternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distributed backend during training by specifying a \njoblib.parallel_backend\n:\n\n\nfrom sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('scheduler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y)\n\n\n\n\nSee \ndask's distributed joblib integration\n for more.", "title": "Using TPOT" }, { @@ -27,7 +27,7 @@ }, { "location": "/using/#tpot-on-the-command-line", - "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. 
\nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one. \nSee the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. 
\nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path \nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. \nThis is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working \nExample: \nmkdir my_checkpoints \n-cf ./my_checkpoints -es EARLY_STOP Any positive integer \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnd optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.", + "text": "To use TPOT via the command line, enter the following command with a path to the data file: tpot /path_to/data_file.csv An example command-line call to TPOT may look like: tpot data/mnist.csv -is , -target class -o tpot_exported_pipeline.py -g 5 -p 20 -cv 5 -s 42 -v 2 TPOT offers several arguments that can be provided at the command line. To see brief descriptions of these arguments,\nenter the following command: tpot --help Detailed descriptions of the command-line arguments are below. Argument Parameter Valid values Effect -is INPUT_SEPARATOR Any string Character used to separate columns in the input file. -target TARGET_NAME Any string Name of the target column in the input file. -mode TPOT_MODE ['classification', 'regression'] Whether TPOT is being used for a supervised classification or regression problem. -o OUTPUT_FILE String path to a file File to export the code for the final optimized pipeline. -g GENERATIONS Any positive integer Number of iterations to run the pipeline optimization process. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -p POPULATION_SIZE Any positive integer Number of individuals to retain in the GP population every generation. Generally, TPOT will work better when you give it more individuals (and therefore time) to optimize the pipeline. \nTPOT will evaluate POPULATION_SIZE + GENERATIONS x OFFSPRING_SIZE pipelines in total. -os OFFSPRING_SIZE Any positive integer Number of offspring to produce in each GP generation. \nBy default, OFFSPRING_SIZE = POPULATION_SIZE. -mr MUTATION_RATE [0.0, 1.0] GP mutation rate in the range [0.0, 1.0]. 
This tells the GP algorithm how many pipelines to apply random changes to every generation. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. -xr CROSSOVER_RATE [0.0, 1.0] GP crossover rate in the range [0.0, 1.0]. This tells the GP algorithm how many pipelines to \"breed\" every generation. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. -scoring SCORING_FN 'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1',\n'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss', 'neg_mean_absolute_error',\n'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro',\n'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples',\n'recall_weighted', 'roc_auc', 'my_module.scorer_name*' Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. \nTPOT assumes that any function with \"error\" or \"loss\" in the name is meant to be minimized, whereas any other functions will be maximized. \nmy_module.scorer_name: You can also specify your own function or a full python path to an existing one. \nSee the section on scoring functions for more details. -cv CV Any integer > 1 Number of folds to evaluate each pipeline over in k-fold cross-validation during the TPOT optimization process. -sub SUBSAMPLE (0.0, 1.0] Subsample ratio of the training instance. Setting it to 0.5 means that TPOT randomly collects half of training samples for pipeline optimization process. -njobs NUM_JOBS Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. \nAssigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime MAX_TIME_MINS Any positive integer How many minutes TPOT has to optimize the pipeline. \nIf provided, this setting will override the \"generations\" parameter and allow TPOT to run until it runs out of time. -maxeval MAX_EVAL_MINS Any positive float How many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to consider more complex pipelines but will also allow TPOT to run longer. -s RANDOM_STATE Any positive integer Random number generator seed for reproducibility. \nSet this seed if you want your TPOT run to be reproducible with the same seed and data set in the future. -config CONFIG_FILE String or file path Operators and parameter configurations in TPOT: Path for configuration file: TPOT will use the path to a configuration file for customizing the operators and parameters that TPOT uses in the optimization process string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. 
-template TEMPLATE String Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. -memory MEMORY String or file path If supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. Memory caching mode in TPOT: Path for a caching directory: TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown. string 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. -cf CHECKPOINT_FOLDER Folder path \nIf supplied, a folder you created, in which tpot will periodically save pipelines in pareto front so far while optimizing. \nThis is useful in multiple cases: sudden death before tpot could save an optimized pipeline progress tracking grabbing a pipeline while tpot is working \nExample: \nmkdir my_checkpoints \n-cf ./my_checkpoints -es EARLY_STOP Any positive integer \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnd optimization process if there is no improvement in the set number of generations. -v VERBOSITY {0, 1, 2, 3} How much information TPOT communicates while it is running. \n0 = none, 1 = minimal, 2 = high, 3 = all. \nA setting of 2 or higher will add a progress bar during the optimization procedure. --no-update-check Flag indicating whether the TPOT version checker should be disabled. --version Show TPOT's version number and exit. --help Show TPOT's help documentation and exit.", "title": "TPOT on the command line" }, { @@ -45,6 +45,16 @@ "text": "Beyond the default configurations that come with TPOT, in some cases it is useful to limit the algorithms and parameters that TPOT considers. For that reason, we allow users to provide TPOT with a custom configuration for its operators and parameters. The custom TPOT configuration must be in nested dictionary format, where the first level key is the path and name of the operator (e.g., sklearn.naive_bayes.MultinomialNB ) and the second level key is the corresponding parameter name for that operator (e.g., fit_prior ). The second level key should point to a list of parameter values for that parameter, e.g., 'fit_prior': [True, False] . 
For a simple example, the configuration could be: tpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n} in which case TPOT would only consider pipelines containing GaussianNB , BernoulliNB , MultinomialNB , and tune those algorithm's parameters in the ranges provided. This dictionary can be passed directly within the code to the TPOTClassifier / TPOTRegressor config_dict parameter, described above. For example: from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot_config = {\n 'sklearn.naive_bayes.GaussianNB': {\n },\n\n 'sklearn.naive_bayes.BernoulliNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n },\n\n 'sklearn.naive_bayes.MultinomialNB': {\n 'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],\n 'fit_prior': [True, False]\n }\n}\n\ntpot = TPOTClassifier(generations=5, population_size=20, verbosity=2,\n config_dict=tpot_config)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Command-line users must create a separate .py file with the custom configuration and provide the path to the file to the tpot call. For example, if the simple example configuration above is saved in tpot_classifier_config.py , that configuration could be used on the command line with the command: tpot data/mnist.csv -is , -target class -config tpot_classifier_config.py -g 5 -p 20 -v 2 -o tpot_exported_pipeline.py When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config . Otherwise, TPOT will not be able to locate the configuration dictionary. For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code. Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers.", "title": "Customizing TPOT's operators and parameters" }, + { + "location": "/using/#template-option-in-tpot", + "text": "Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. Below is a simple example to use template option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of SelectorMixin ), 2nd step is a feature transformer (a subclass of TransformerMixin ) and 3rd step is a classifier for classification (a subclass of ClassifierMixin ). The last step must be Classifier for TPOTClassifier 's template but Regressor for TPOTRegressor . Note: although SelectorMixin is subclass of TransformerMixin in scikit-leawrn, but Transformer in this option excludes those subclasses of SelectorMixin . 
tpot_obj = TPOTClassifier(\n template='Selector-Transformer-Classifier'\n ) If a specific operator, e.g. SelectPercentile , is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'.", + "title": "Template option in TPOT" + }, + { + "location": "/using/#featuresetselector-in-tpot", + "text": "FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on priori export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ( MSigDB ) in the 1st step of pipeline via template option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by \";\". Below is a example how to use this operator in TPOT. Please check our preprint paper for more details. from tpot import TPOTClassifier\nimport numpy as np\nimport pandas as pd\nfrom tpot.config import classifier_config_dict\ntest_data = pd.read_csv(\"https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv\")\ntest_X = test_data.drop(\"class\", axis=1)\ntest_y = test_data['class']\n\n# add FeatureSetSelector into tpot configuration\nclassifier_config_dict['tpot.builtins.FeatureSetSelector'] = {\n 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],\n 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above\n #'sel_subset': list(combinations(range(3), 2)) # select two feature sets\n}\n\n\ntpot = TPOTClassifier(generations=5,\n population_size=50, verbosity=2,\n template='FeatureSetSelector-Transformer-Classifier',\n config_dict=classifier_config_dict)\ntpot.fit(test_X, test_y)", + "title": "FeatureSetSelector in TPOT" + }, { "location": "/using/#pipeline-caching-in-tpot", "text": "With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or sklearn.external.joblib.Memory in case they want to re-use the memory cache in future TPOT runs (or a warm_start run). There are three methods for enabling memory caching in TPOT: from tpot import TPOTClassifier\nfrom tempfile import mkdtemp\nfrom sklearn.externals.joblib import Memory\nfrom shutil import rmtree\n\n# Method 1, auto mode: TPOT uses memory caching with a temporary directory and cleans it up upon shutdown\ntpot = TPOTClassifier(memory='auto')\n\n# Method 2, with a custom directory for memory caching\ntpot = TPOTClassifier(memory='/to/your/path')\n\n# Method 3, with a Memory object\ncachedir = mkdtemp() # Create a temporary folder\nmemory = Memory(cachedir=cachedir, verbose=0)\ntpot = TPOTClassifier(memory=memory)\n\n# Clear the cache directory when you don't need it anymore\nrmtree(cachedir) Note: TPOT does NOT clean up memory caches if users set a custom directory path or Memory object. 
We recommend that you clean up the memory caches when you don't need it anymore.", @@ -57,22 +67,22 @@ }, { "location": "/using/#parallel-training-with-dask", - "text": "For large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a Dask cluster.\nThe dask-examples binder has a runnable example\nwith a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on the your Dask cluster regardless of whether n_jobs is specified. estimator = TPOTEstimator(use_dask=True) This will use use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the distributed scheduler UI . Alternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distribued backend during training by specifying a joblib.parallel_backend : from sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('schedueler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y) See dask's distributed joblib integration for more.", + "text": "For large problems or working on Jupyter notebook, we highly recommend that you can distribute the work on a Dask cluster.\nThe dask-examples binder has a runnable example\nwith a small dask cluster. To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True , TPOT will use as many cores as available on the your Dask cluster. If n_jobs is specified, then it will control the chunk size (10* n_jobs if it is less then offspring size) of parallel training. estimator = TPOTEstimator(use_dask=True, n_jobs=-1) This will use use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data.\nIt will also provide fine-grained diagnostics in the distributed scheduler UI . 
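Putting these pieces together, a minimal sketch could look like the following (the scheduler address is a placeholder, X_train/y_train are assumed to be defined as in the earlier examples, and TPOTClassifier stands in for whichever TPOT estimator you are using):

from tpot import TPOTClassifier
from dask.distributed import Client

# connect to your running Dask scheduler; the active client should then be used for fitting
client = Client('scheduler-address:8786')

tpot = TPOTClassifier(generations=5, population_size=20,
                      use_dask=True, n_jobs=-1, verbosity=2)
tpot.fit(X_train, y_train)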
Alternatively, Dask implements a joblib backend.\nYou can instruct TPOT to use the distributed backend during training by specifying a  joblib.parallel_backend : from sklearn.externals import joblib\nimport distributed.joblib\nfrom dask.distributed import Client\n\n# connect to the cluster\nclient = Client('scheduler-address')\n\n# create the estimator normally\nestimator = TPOTClassifier(n_jobs=-1)\n\n# perform the fit in this context manager\nwith joblib.parallel_backend(\"dask\"):\n estimator.fit(X, y) See  dask's distributed joblib integration  for more.", "title": "Parallel Training with Dask" }, { "location": "/api/",
This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. 
It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized 
pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. 
TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. 
By default, the number of offspring is equal to the number of population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. 
It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \n_pareto_front_fitted_pipelines\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring 
function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect TPOT's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTRegressor is 'neg_mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", + "text": "Classification\n\n\nclass\n tpot.\nTPOTClassifier\n(\ngenerations\n=100, \npopulation_size\n=100,\n 
\noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='accuracy', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \ntemplate\n=\"RandomTree\",\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised classification tasks.\n\n\nThe TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to run the pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='accuracy')\n\n\nFunction used to evaluate the quality of a given pipeline for the classification problem. 
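For example, instead of a built-in metric name, a callable with the signature scorer(estimator, X, y) can be supplied to scoring. A minimal sketch (the metric choice and parameter values below are illustrative, not defaults):

from sklearn.metrics import balanced_accuracy_score
from tpot import TPOTClassifier

def my_scorer(estimator, X, y):
    # Any callable with this signature works; here we simply score the
    # estimator's predictions with balanced accuracy.
    return balanced_accuracy_score(y, estimator.predict(X))

tpot = TPOTClassifier(generations=5, population_size=20, scoring=my_scorer)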
The following built-in scoring functions can be used:\n\n\n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc'\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a StratifiedKFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. 
Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTClassifier configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\ntemplate\n: string (default=\"RandomTree\")\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT.\n\n\nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
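A minimal sketch of enabling transformer caching through the memory parameter just described; both forms follow the possible inputs listed next, and the directory path is purely illustrative:

from tpot import TPOTClassifier

# 'auto' caches fitted transformers in a temporary directory that TPOT
# cleans up on shutdown.
tpot = TPOTClassifier(generations=5, population_size=20, memory='auto')

# Alternatively, supply a caching directory; TPOT will NOT delete it afterwards.
tpot = TPOTClassifier(generations=5, population_size=20,
                      memory='/tmp/tpot_cache')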
More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = 
load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, classes[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\npredict_proba\n(features)\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_classes)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, classes, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\nclasses\n: array-like {n_samples}\n\n\nList of class labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the classes for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted classes for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict_proba(features)\n\n\n\n\n\nUse the optimized pipeline to estimate the class probabilities for a feature set.\n\n\nNote: This function will only work for pipelines whose final classifier supports the \npredict_proba\n function. 
TPOT will raise an error otherwise.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples, n_classes}\n\n\nThe class probabilities of the input samples\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_classes)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTClassifier is 'accuracy'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_classes\n: array-like {n_samples}\n\n\nList of class labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything\n\n\n\n\n\n\n\n\n\n\nRegression\n\n\nclass\n tpot.\nTPOTRegressor\n(\ngenerations\n=100, \npopulation_size\n=100,\n \noffspring_size\n=None, \nmutation_rate\n=0.9,\n \ncrossover_rate\n=0.1,\n \nscoring\n='neg_mean_squared_error', \ncv\n=5,\n \nsubsample\n=1.0, \nn_jobs\n=1,\n \nmax_time_mins\n=None, \nmax_eval_time_mins\n=5,\n \nrandom_state\n=None, \nconfig_dict\n=None,\n \ntemplate\n=\"RandomTree\",\n \nwarm_start\n=False,\n \nmemory\n=None,\n \nuse_dask\n=False,\n \nperiodic_checkpoint_folder\n=None,\n \nearly_stop\n=None,\n \nverbosity\n=0,\n \ndisable_update_check\n=False\n)\n\n\n\nsource\n\n\n\nAutomated machine learning for supervised regression tasks.\n\n\nThe TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the \nscikit-learn API\n.\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline.\n\n\nBy default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the \nconfig_dict\n parameter.\n\n\nRead more in the \nUser Guide\n.\n\n\n\n\n\n\nParameters:\n\n\n\n\ngenerations\n: int, optional (default=100)\n\n\nNumber of iterations to the run pipeline optimization process. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.\n\n\nTPOT will evaluate \npopulation_size\n + \ngenerations\n \u00d7 \noffspring_size\n pipelines in total.\n\n\n\n\npopulation_size\n: int, optional (default=100)\n\n\nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number.\n\n\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline.\n\n\n\n\noffspring_size\n: int, optional (default=None)\n\n\nNumber of offspring to produce in each genetic programming generation. Must be a positive number. 
By default, the number of offspring is equal to the number of population size.\n\n\n\n\nmutation_rate\n: float, optional (default=0.9)\n\n\nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms.\n\n\n\n\ncrossover_rate\n: float, optional (default=0.1)\n\n\nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation.\n\n\n\nmutation_rate\n + \ncrossover_rate\n cannot exceed 1.0.\n\n\nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms.\n\n\n\n\nscoring\n: string or callable, optional (default='neg_mean_squared_error')\n\n\nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used:\n\n\n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2'\n\n\nNote that we recommend using the \nneg\n version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric.\n\n\nIf you would like to use a custom scorer, you can pass the callable object/function with signature \nscorer(estimator, X, y)\n.\n\n\nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature \nscore_func(y_true, y_pred)\n. TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11.\n\n\nSee the section on \nscoring functions\n for more details.\n\n\n\n\ncv\n: int, cross-validation generator, or an iterable, optional (default=5)\n\n\nCross-validation strategy used when evaluating pipelines.\n\n\nPossible inputs:\n\n\n\ninteger, to specify the number of folds in a KFold,\n\n\nAn object to be used as a cross-validation generator, or\n\n\nAn iterable yielding train/test splits.\n\n\n\n\n\n\n\nsubsample\n: float, optional (default=1.0)\n\n\nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0].\n\n\nSetting \nsubsample\n=0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process.\n\n\n\n\nn_jobs\n: integer, optional (default=1)\n\n\nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process.\n\n\nSetting \nn_jobs\n=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. 
Beware that using multiple processes on the same machine may cause memory issues for large datasets\n\n\n\n\nmax_time_mins\n: integer or None, optional (default=None)\n\n\nHow many minutes TPOT has to optimize the pipeline.\n\n\nIf not None, this setting will override the \ngenerations\n parameter and allow TPOT to run until \nmax_time_mins\n minutes elapse.\n\n\n\n\nmax_eval_time_mins\n: float, optional (default=5)\n\n\nHow many minutes TPOT has to evaluate a single pipeline.\n\n\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines.\n\n\n\n\nrandom_state\n: integer or None, optional (default=None)\n\n\nThe seed of the pseudo random number generator used in TPOT.\n\n\nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed.\n\n\n\n\nconfig_dict\n: Python dictionary, string, or None, optional (default=None)\n\n\nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process.\n\n\nPossible inputs are:\n\n\n\nPython dictionary, TPOT will use your custom configuration,\n\n\nstring 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or\n\n\nstring 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or\n\n\nstring 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or\n\n\nNone, TPOT will use the default TPOTRegressor configuration.\n\n\n\nSee the \nbuilt-in configurations\n section for the list of configurations included with TPOT, and the \ncustom configuration\n section for more information and examples of how to create your own TPOT configurations.\n\n\n\n\ntemplate\n: string (default=\"RandomTree\")\n\n\nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT.\n\n\nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". 
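For instance, a fixed linear structure of a feature selector followed by a transformer and a regressor could be requested as in the sketch below (generation and population values are illustrative):

from tpot import TPOTRegressor

# Constrain every evaluated pipeline to Selector -> Transformer -> Regressor.
tpot = TPOTRegressor(generations=5, population_size=20,
                     template='Selector-Transformer-Regressor')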
By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the \n template option in tpot\n section for more details.\n\n\n\n\nwarm_start\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT instance will reuse the population from previous calls to \nfit()\n.\n\n\nSetting \nwarm_start\n=True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off.\n\n\n\n\nmemory\n: a sklearn.external.joblib.Memory object or string, optional (default=None)\n\n\nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in \nscikit-learn documentation\n\n\n\nPossible inputs are:\n\n\n\nString 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or\n\n\nPath of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nMemory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or\n\n\nNone, TPOT does not use memory caching.\n\n\n\n\n\n\n\nuse_dask\n: boolean, optional (default: False)\n\n\nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler.\n\n\nSee \navoid repeated work\n for more details.\n\n\n\n\nperiodic_checkpoint_folder\n: path string, optional (default: None)\n\n\nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing.\n\nCurrently once per generation but not more often than once per 30 seconds.\n\nUseful in multiple cases:\n\n\n\nSudden death before TPOT could save optimized pipeline\n\n\nTrack its progress\n\n\nGrab pipelines while it's still optimizing\n\n\n\n\n\n\n\nearly_stop\n: integer, optional (default: None)\n\n\nHow many generations TPOT checks whether there is no improvement in optimization process.\n\n\nEnds the optimization process if there is no improvement in the given number of generations.\n\n\n\n\nverbosity\n: integer, optional (default=0)\n\n\nHow much information TPOT communicates while it's running.\n\n\nPossible inputs are:\n\n\n\n0, TPOT will print nothing,\n\n\n1, TPOT will print minimal information,\n\n\n2, TPOT will print more information and provide a progress bar, or\n\n\n3, TPOT will print everything and provide a progress bar.\n\n\n\n\n\n\n\ndisable_update_check\n: boolean, optional (default=False)\n\n\nFlag indicating whether the TPOT version checker should be disabled.\n\n\nThe update checker will tell you when a new version of TPOT has been released.\n\n\n\n\n\n\n\n\n\n\nAttributes:\n\n\n\n\nfitted_pipeline_\n: scikit-learn Pipeline object\n\n\nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset.\n\n\n\n\npareto_front_fitted_pipelines_\n: Python dictionary\n\n\nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset.\n\n\nThe TPOT Pareto front provides a 
trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline.\n\n\nNote: \npareto_front_fitted_pipelines_\n is only available when \nverbosity\n=3.\n\n\n\n\nevaluated_individuals_\n: Python dictionary\n\n\nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline).\n\n\nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated.\n\n\n\n\n\n\n\n\n\n\nExample\n\n\nfrom tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n    train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py')\n\n\n\n\nFunctions\n\n\n\n\n\n\nfit\n(features, target[, sample_weight, groups])\n\n\nRun the TPOT optimization process on the given training data.\n\n\n\n\n\n\n\npredict\n(features)\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nscore\n(testing_features, testing_target)\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\n\n\n\n\n\nexport\n(output_file_name)\n\n\nExport the optimized pipeline as Python code.\n\n\n\n\n\n\n\n\n\nfit(features, target, sample_weight=None, groups=None)\n\n\n\n\n\nRun the TPOT optimization process on the given training data.\n\n\nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validation to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing \nmedian value imputation\n.\n\n\nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT.\n\n\n\n\ntarget\n: array-like {n_samples}\n\n\nList of target labels for prediction\n\n\n\n\nsample_weight\n: array-like {n_samples}, optional\n\n\nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. 
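A brief sketch of supplying sample_weight, continuing from the Boston housing example above (the weighting scheme is an arbitrary illustration):

import numpy as np

# Up-weight the second half of the training samples; weights are forwarded
# to pipeline steps whose fit() accepts a sample_weight argument.
weights = np.ones(len(y_train))
weights[len(y_train) // 2:] = 2.0
tpot.fit(X_train, y_train, sample_weight=weights)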
By default, using sample_weight does not affect TPOT's scoring functions, which determine preferences between pipelines.\n\n\n\n\ngroups\n: array-like, with shape {n_samples, }, optional\n\n\nGroup labels for the samples used when performing cross-validation.\n\n\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as \nsklearn.model_selection.GroupKFold\n.\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\nself\n: object\n\n\nReturns a copy of the fitted TPOT object\n\n\n\n\n\n\n\n\n\n\n\n\n\n\npredict(features)\n\n\n\n\n\nUse the optimized pipeline to predict the target values for a feature set.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\nfeatures\n: array-like {n_samples, n_features}\n\n\nFeature matrix\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\npredictions\n: array-like {n_samples}\n\n\nPredicted target values for the samples in the feature matrix\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nscore(testing_features, testing_target)\n\n\n\n\n\nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function.\n\n\nThe default scoring function for TPOTRegressor is 'neg_mean_squared_error'.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\ntesting_features\n: array-like {n_samples, n_features}\n\n\nFeature matrix of the testing set\n\n\n\n\ntesting_target\n: array-like {n_samples}\n\n\nList of target labels for prediction in the testing set\n\n\n\n\n\n\n\n\n\nReturns:\n\n\n\n\naccuracy_score\n: float\n\n\nThe estimated test set accuracy according to the user-specified scoring function.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nexport(output_file_name)\n\n\n\n\n\nExport the optimized pipeline as Python code.\n\n\nSee the \nusage documentation\n for example usage of the export function.\n\n\n\n\n\n\n\nParameters:\n\n\n\n\noutput_file_name\n: string\n\n\nString containing the path and file name of the desired output file\n\n\n\n\n\n\n\nReturns:\n\n\n\nDoes not return anything", "title": "TPOT API" }, { "location": "/api/#classification", - "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. 
population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. 
\nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. 
\nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. 
Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTClassifier ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='accuracy', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n template =\"RandomTree\",\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised classification tasks. 
The TPOTClassifier performs an intelligent search over machine learning pipelines that can contain supervised classification models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTClassifier will also search over the hyperparameters of all objects in the pipeline. By default, TPOTClassifier will search over a broad range of supervised classification algorithms, transformers, and their parameters.\nHowever, the algorithms, transformers, and hyperparameters that the TPOTClassifier searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='accuracy') \nFunction used to evaluate the quality of a given pipeline for the classification problem. The following built-in scoring functions can be used: \n'accuracy', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'neg_log_loss','precision',\n'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc' \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. 
\nPossible inputs: integer, to specify the number of folds in a StratifiedKFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets. max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTClassifier configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=\"RandomTree\") \nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. \nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. 
If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Classifier\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. 
pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: pareto_front_fitted_pipelines_ is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTClassifier\nfrom sklearn.datasets import load_digits\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_digits()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTClassifier(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_mnist_pipeline.py') Functions fit (features, classes[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the classes for a feature set. predict_proba (features) Use the optimized pipeline to estimate the class probabilities for a feature set. score (testing_features, testing_classes) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, classes, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. classes : array-like {n_samples} \nList of class labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. 
\nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the classes for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted classes for the samples in the feature matrix predict_proba(features) \nUse the optimized pipeline to estimate the class probabilities for a feature set. \nNote: This function will only work for pipelines whose final classifier supports the predict_proba function. TPOT will raise an error otherwise. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples, n_classes} \nThe class probabilities of the input samples score(testing_features, testing_classes) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'accuracy'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_classes : array-like {n_samples} \nList of class labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Classification" }, { "location": "/api/#regression", - "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. 
\nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. \nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. 
\nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. 
verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. 
target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. \nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", + "text": "class tpot. TPOTRegressor ( generations =100, population_size =100,\n offspring_size =None, mutation_rate =0.9,\n crossover_rate =0.1,\n scoring ='neg_mean_squared_error', cv =5,\n subsample =1.0, n_jobs =1,\n max_time_mins =None, max_eval_time_mins =5,\n random_state =None, config_dict =None,\n template =\"RandomTree\",\n warm_start =False,\n memory =None,\n use_dask =False,\n periodic_checkpoint_folder =None,\n early_stop =None,\n verbosity =0,\n disable_update_check =False ) source Automated machine learning for supervised regression tasks. The TPOTRegressor performs an intelligent search over machine learning pipelines that can contain supervised regression models,\npreprocessors, feature selection techniques, and any other estimator or transformer that follows the scikit-learn API .\nThe TPOTRegressor will also search over the hyperparameters of all objects in the pipeline. By default, TPOTRegressor will search over a broad range of supervised regression models, transformers, and their hyperparameters.\nHowever, the models, transformers, and parameters that the TPOTRegressor searches over can be fully customized using the config_dict parameter. Read more in the User Guide . Parameters: generations : int, optional (default=100) \nNumber of iterations to the run pipeline optimization process. Must be a positive number. \nGenerally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline. \nTPOT will evaluate population_size + generations \u00d7 offspring_size pipelines in total. 
population_size : int, optional (default=100) \nNumber of individuals to retain in the genetic programming population every generation. Must be a positive number. \nGenerally, TPOT will work better when you give it more individuals with which to optimize the pipeline. offspring_size : int, optional (default=None) \nNumber of offspring to produce in each genetic programming generation. Must be a positive number. By default, the number of offspring is equal to the number of population size. mutation_rate : float, optional (default=0.9) \nMutation rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the GP algorithm how many pipelines to apply random changes to every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the mutation rate affects GP algorithms. crossover_rate : float, optional (default=0.1) \nCrossover rate for the genetic programming algorithm in the range [0.0, 1.0]. This parameter tells the genetic programming algorithm how many pipelines to \"breed\" every generation. mutation_rate + crossover_rate cannot exceed 1.0. \nWe recommend using the default parameter unless you understand how the crossover rate affects GP algorithms. scoring : string or callable, optional (default='neg_mean_squared_error') \nFunction used to evaluate the quality of a given pipeline for the regression problem. The following built-in scoring functions can be used: \n'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'r2' \nNote that we recommend using the neg version of mean squared error and related metrics so TPOT will minimize (instead of maximize) the metric. \nIf you would like to use a custom scorer, you can pass the callable object/function with signature scorer(estimator, X, y) . \nIf you would like to use a metric function, you can pass the callable function to this parameter with the signature score_func(y_true, y_pred) . TPOT assumes that any function with \"error\" or \"loss\" in the function name is meant to be minimized, whereas any other functions will be maximized. This scoring type was deprecated in version 0.9.1 and will be removed in version 0.11. \nSee the section on scoring functions for more details. cv : int, cross-validation generator, or an iterable, optional (default=5) \nCross-validation strategy used when evaluating pipelines. \nPossible inputs: integer, to specify the number of folds in a KFold, An object to be used as a cross-validation generator, or An iterable yielding train/test splits. subsample : float, optional (default=1.0) \nFraction of training samples that are used during the TPOT optimization process. Must be in the range (0.0, 1.0]. \nSetting subsample =0.5 tells TPOT to use a random subsample of half of the training data. This subsample will remain the same during the entire pipeline optimization process. n_jobs : integer, optional (default=1) \nNumber of processes to use in parallel for evaluating pipelines during the TPOT optimization process. \nSetting n_jobs =-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets max_time_mins : integer or None, optional (default=None) \nHow many minutes TPOT has to optimize the pipeline. 
\nIf not None, this setting will override the generations parameter and allow TPOT to run until max_time_mins minutes elapse. max_eval_time_mins : float, optional (default=5) \nHow many minutes TPOT has to evaluate a single pipeline. \nSetting this parameter to higher values will allow TPOT to evaluate more complex pipelines, but will also allow TPOT to run longer. Use this parameter to help prevent TPOT from wasting time on evaluating time-consuming pipelines. random_state : integer or None, optional (default=None) \nThe seed of the pseudo random number generator used in TPOT. \nUse this parameter to make sure that TPOT will give you the same results each time you run it against the same data set with that seed. config_dict : Python dictionary, string, or None, optional (default=None) \nA configuration dictionary for customizing the operators and parameters that TPOT searches in the optimization process. \nPossible inputs are: Python dictionary, TPOT will use your custom configuration, string 'TPOT light', TPOT will use a built-in configuration with only fast models and preprocessors, or string 'TPOT MDR', TPOT will use a built-in configuration specialized for genomic studies, or string 'TPOT sparse': TPOT will use a configuration dictionary with a one-hot encoder and the operators normally included in TPOT that also support sparse matrices, or None, TPOT will use the default TPOTRegressor configuration. \nSee the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations. template : string (default=\"RandomTree\") \nTemplate of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. \nSo far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by \"-\", e.g. \"SelectPercentile-Transformer-Regressor\". By default value of template is \"RandomTree\", TPOT generates tree-based pipeline randomly.\n\nSee the template option in tpot section for more details. warm_start : boolean, optional (default=False) \nFlag indicating whether the TPOT instance will reuse the population from previous calls to fit() . \nSetting warm_start =True can be useful for running TPOT for a short time on a dataset, checking the results, then resuming the TPOT run from where it left off. memory : a sklearn.external.joblib.Memory object or string, optional (default=None) \nIf supplied, pipeline will cache each transformer after calling fit. This feature is used to avoid computing the fit transformers within a pipeline if the parameters and input data are identical with another fitted pipeline during optimization process. 
More details about memory caching in scikit-learn documentation \nPossible inputs are: String 'auto': TPOT uses memory caching with a temporary directory and cleans it up upon shutdown, or Path of a caching directory, TPOT uses memory caching with the provided directory and TPOT does NOT clean the caching directory up upon shutdown, or Memory object, TPOT uses the instance of sklearn.external.joblib.Memory for memory caching and TPOT does NOT clean the caching directory up upon shutdown, or None, TPOT does not use memory caching. use_dask : boolean, optional (default: False) \nWhether to use Dask-ML's pipeline optimiziations. This avoid re-fitting\nthe same estimator on the same split of data multiple times. It\nwill also provide more detailed diagnostics when using Dask's\ndistributed scheduler. \nSee avoid repeated work for more details. periodic_checkpoint_folder : path string, optional (default: None) \nIf supplied, a folder in which TPOT will periodically save pipelines in pareto front so far while optimizing. \nCurrently once per generation but not more often than once per 30 seconds. \nUseful in multiple cases: Sudden death before TPOT could save optimized pipeline Track its progress Grab pipelines while it's still optimizing early_stop : integer, optional (default: None) \nHow many generations TPOT checks whether there is no improvement in optimization process. \nEnds the optimization process if there is no improvement in the given number of generations. verbosity : integer, optional (default=0) \nHow much information TPOT communicates while it's running. \nPossible inputs are: 0, TPOT will print nothing, 1, TPOT will print minimal information, 2, TPOT will print more information and provide a progress bar, or 3, TPOT will print everything and provide a progress bar. disable_update_check : boolean, optional (default=False) \nFlag indicating whether the TPOT version checker should be disabled. \nThe update checker will tell you when a new version of TPOT has been released. Attributes: fitted_pipeline_ : scikit-learn Pipeline object \nThe best pipeline that TPOT discovered during the pipeline optimization process, fitted on the entire training dataset. pareto_front_fitted_pipelines_ : Python dictionary \nDictionary containing the all pipelines on the TPOT Pareto front, where the key is the string representation of the pipeline and the value is the corresponding pipeline fitted on the entire training dataset. \nThe TPOT Pareto front provides a trade-off between pipeline complexity (i.e., the number of steps in the pipeline) and the predictive performance of the pipeline. \nNote: _pareto_front_fitted_pipelines is only available when verbosity =3. evaluated_individuals_ : Python dictionary \nDictionary containing all pipelines that were evaluated during the pipeline optimization process, where the key is the string representation of the pipeline and the value is a tuple containing (# of steps in pipeline, accuracy metric for the pipeline). \nThis attribute is primarily for internal use, but may be useful for looking at the other pipelines that TPOT evaluated. 
Example from tpot import TPOTRegressor\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\n\ndigits = load_boston()\nX_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target,\n train_size=0.75, test_size=0.25)\n\ntpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)\ntpot.fit(X_train, y_train)\nprint(tpot.score(X_test, y_test))\ntpot.export('tpot_boston_pipeline.py') Functions fit (features, target[, sample_weight, groups]) Run the TPOT optimization process on the given training data. predict (features) Use the optimized pipeline to predict the target values for a feature set. score (testing_features, testing_target) Returns the optimized pipeline's score on the given testing data using the user-specified scoring function. export (output_file_name) Export the optimized pipeline as Python code. fit(features, target, sample_weight=None, groups=None) \nRun the TPOT optimization process on the given training data. \nUses genetic programming to optimize a machine learning pipeline that maximizes the score on the provided features and target. This pipeline optimization procedure uses internal k-fold cross-validaton to avoid overfitting on the provided data. At the end of the pipeline optimization procedure, the best pipeline is then trained on the entire set of provided samples. Parameters: features : array-like {n_samples, n_features} \nFeature matrix \nTPOT and all scikit-learn algorithms assume that the features will be numerical and there will be no missing values.\nAs such, when a feature matrix is provided to TPOT, all missing values will automatically be replaced (i.e., imputed)\nusing median value imputation . \nIf you wish to use a different imputation strategy than median imputation, please make sure to apply imputation to your feature set prior to passing it to TPOT. target : array-like {n_samples} \nList of target labels for prediction sample_weight : array-like {n_samples}, optional \nPer-sample weights. Higher weights indicate more importance. If specified, sample_weight will be passed to any pipeline element whose fit() function accepts a sample_weight argument. By default, using sample_weight does not affect tpot's scoring functions, which determine preferences between pipelines. groups : array-like, with shape {n_samples, }, optional \nGroup labels for the samples used when performing cross-validation. \nThis parameter should only be used in conjunction with sklearn's Group cross-validation functions, such as sklearn.model_selection.GroupKFold . Returns: self : object \nReturns a copy of the fitted TPOT object predict(features) \nUse the optimized pipeline to predict the target values for a feature set. Parameters: features : array-like {n_samples, n_features} \nFeature matrix Returns: predictions : array-like {n_samples} \nPredicted target values for the samples in the feature matrix score(testing_features, testing_target) \nReturns the optimized pipeline's score on the given testing data using the user-specified scoring function. \nThe default scoring function for TPOTClassifier is 'mean_squared_error'. Parameters: testing_features : array-like {n_samples, n_features} \nFeature matrix of the testing set testing_target : array-like {n_samples} \nList of target labels for prediction in the testing set Returns: accuracy_score : float \nThe estimated test set accuracy according to the user-specified scoring function. export(output_file_name) \nExport the optimized pipeline as Python code. 
\nSee the usage documentation for example usage of the export function. Parameters: output_file_name : string \nString containing the path and file name of the desired output file Returns: \nDoes not return anything", "title": "Regression" }, { diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 09f8842a..5b7788e7 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -4,7 +4,7 @@ http://epistasislab.github.io/tpot/ - 2019-03-01 + 2019-04-11 daily @@ -12,7 +12,7 @@ http://epistasislab.github.io/tpot/installing/ - 2019-03-01 + 2019-04-11 daily @@ -20,7 +20,7 @@ http://epistasislab.github.io/tpot/using/ - 2019-03-01 + 2019-04-11 daily @@ -28,7 +28,7 @@ http://epistasislab.github.io/tpot/api/ - 2019-03-01 + 2019-04-11 daily @@ -36,7 +36,7 @@ http://epistasislab.github.io/tpot/examples/ - 2019-03-01 + 2019-04-11 daily @@ -44,7 +44,7 @@ http://epistasislab.github.io/tpot/contributing/ - 2019-03-01 + 2019-04-11 daily @@ -52,7 +52,7 @@ http://epistasislab.github.io/tpot/releases/ - 2019-03-01 + 2019-04-11 daily @@ -60,7 +60,7 @@ http://epistasislab.github.io/tpot/citing/ - 2019-03-01 + 2019-04-11 daily @@ -68,7 +68,7 @@ http://epistasislab.github.io/tpot/support/ - 2019-03-01 + 2019-04-11 daily @@ -76,7 +76,7 @@ http://epistasislab.github.io/tpot/related/ - 2019-03-01 + 2019-04-11 daily diff --git a/docs/using/index.html b/docs/using/index.html index fd6ed07d..2f1642a4 100644 --- a/docs/using/index.html +++ b/docs/using/index.html @@ -80,6 +80,12 @@
  • Customizing TPOT's operators and parameters
  • +
  • Template option in TPOT
  • + + +
  • FeatureSetSelector in TPOT
  • + +
  • Pipeline caching in TPOT
  • @@ -367,7 +373,7 @@

    TPOT on the command line

    Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.

    -Assigning this to -1 will use as many cores as available on the computer. +Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime @@ -409,6 +415,15 @@

    TPOT on the command line

    +-template +TEMPLATE +String +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. + + + -memory MEMORY String or file path @@ -641,6 +656,41 @@

    Customizing TPOT's operators

    When using the command-line interface, the configuration file specified in the -config parameter must name its custom TPOT configuration tpot_config. Otherwise, TPOT will not be able to locate the configuration dictionary.
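    For illustration, such a configuration file might look like the following (a hedged sketch; the file name, operators, and hyperparameter values are only examples, not the full default configuration):

```Python
# custom_config.py -- passed to TPOT via the -config command-line option.
# The dictionary must be named tpot_config so the CLI can locate it.
tpot_config = {
    'sklearn.naive_bayes.GaussianNB': {},
    'sklearn.tree.DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
    },
}
```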

    For more detailed examples of how to customize TPOT's operator configuration, see the default configurations for classification and regression in TPOT's source code.

    Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply neither import nor use XGBoost in the pipelines it considers.

    +

    Template option in TPOT

    +

    The template option provides a way to specify a desired structure for the machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. The current implementation only supports linear pipelines.

    +

    Below is a simple example of using the template option. The pipelines generated/evaluated in TPOT will follow this structure: the 1st step is a feature selector (a subclass of SelectorMixin), the 2nd step is a feature transformer (a subclass of TransformerMixin) and the 3rd step is a classifier for classification (a subclass of ClassifierMixin). The last step must be Classifier for a TPOTClassifier template and Regressor for a TPOTRegressor template. Note: although SelectorMixin is a subclass of TransformerMixin in scikit-learn, Transformer in this option excludes subclasses of SelectorMixin.

    +
    tpot_obj = TPOTClassifier(
    +                template='Selector-Transformer-Classifier'
    +                )
    +
    + +

    If a specific operator, e.g. SelectPercentile, is preferred for the 1st step of the pipeline, the template can be defined as 'SelectPercentile-Transformer-Classifier', as in the sketch below.
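A minimal sketch of that case (assuming the default classification configuration, which includes SelectPercentile; the generations and population_size values are illustrative only):

```Python
from tpot import TPOTClassifier

# Pin the 1st step to SelectPercentile and let TPOT choose the remaining
# transformer and classifier steps (and all hyperparameters) itself.
tpot_obj = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    template='SelectPercentile-Transformer-Classifier'
)
```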

    +

    FeatureSetSelector in TPOT

    +

    FeatureSetSelector is a special new operator in TPOT. This operator enables feature selection based on a priori expert knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) sets based on GO (Gene Ontology) terms or annotated gene sets from the Molecular Signatures Database (MSigDB) in the 1st step of the pipeline via the template option above, in order to reduce dimensionality and TPOT computation time. This operator requires a subset list in csv format. The csv file has only three columns: the 1st column is the feature set name, the 2nd column is the total number of features in the set and the 3rd column is a list of feature names (if input X is a pandas.DataFrame) or indexes (if input X is a numpy.ndarray) delimited by ";". Below is an example of how to use this operator in TPOT, followed by a sketch of the subset list format.

    +

    Please check our preprint paper for more details.

    +
    from tpot import TPOTClassifier
    +import numpy as np
    +import pandas as pd
    +from tpot.config import classifier_config_dict
    +test_data = pd.read_csv("https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv")
    +test_X = test_data.drop("class", axis=1)
    +test_y = test_data['class']
    +
    +# add FeatureSetSelector into tpot configuration
    +classifier_config_dict['tpot.builtins.FeatureSetSelector'] = {
    +    'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'],
    +    'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above
    +    #'sel_subset': list(combinations(range(3), 2)) # select two feature sets
    +}
    +
    +
    +tpot = TPOTClassifier(generations=5,
    +                           population_size=50, verbosity=2,
    +                           template='FeatureSetSelector-Transformer-Classifier',
    +                           config_dict=classifier_config_dict)
    +tpot.fit(test_X, test_y)
    +
    +
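For reference, a subset list following the three-column layout described above could be built like this (a hedged sketch; the column header names, file name, and gene identifiers are illustrative assumptions, not a documented requirement):

```Python
import pandas as pd

# Each row describes one feature set: its name, its size, and its
# ";"-delimited member features (names for a DataFrame input, indexes for an ndarray).
subsets = pd.DataFrame({
    'Subset': ['go_term_A', 'go_term_B'],
    'Size': [3, 2],
    'Features': ['geneA;geneB;geneC', 'geneD;geneE'],
})
subsets.to_csv('my_subset_list.csv', index=False)
# The resulting file path can then be supplied via the 'subset_list' entry above.
```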

    Pipeline caching in TPOT

    With the memory parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to those of another fitted pipeline during the optimization process. TPOT allows users to specify a custom directory path or a sklearn.external.joblib.Memory object in case they want to re-use the memory cache in future TPOT runs (or a warm_start run).
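As a quick illustration (a hedged sketch; 'auto' uses a temporary cache directory that TPOT cleans up on shutdown, while a directory path persists across runs, as documented above; the other constructor values are illustrative):

```Python
from tpot import TPOTClassifier

# Cache fitted transformers in a temporary directory managed by TPOT.
tpot = TPOTClassifier(generations=5, population_size=20, memory='auto', verbosity=2)

# Or point to a persistent directory to re-use the cache in a later (warm_start) run.
tpot_persistent = TPOTClassifier(generations=5, population_size=20,
                                 memory='/tmp/tpot_cache', verbosity=2)
```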

    There are three methods for enabling memory caching in TPOT:

    @@ -684,8 +734,8 @@

    Parallel Training with Dask

    For large problems, or when working in a Jupyter notebook, we highly recommend distributing the work on a Dask cluster. The dask-examples binder has a runnable example with a small Dask cluster.

    -

    To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True, TPOT will use as many cores as available on the your Dask cluster regardless of whether n_jobs is specified.

    -
    estimator = TPOTEstimator(use_dask=True)
    +

    To use your Dask cluster to fit a TPOT model, specify the use_dask keyword when you create the TPOT estimator. Note: if use_dask=True, TPOT will use as many cores as available on your Dask cluster. If n_jobs is specified, then it will control the chunk size (10*n_jobs if that is less than the offspring size) of parallel training.

    +
    estimator = TPOTEstimator(use_dask=True, n_jobs=-1)
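Expanding on the one-liner above, a hedged sketch of running against a Dask cluster might look like this (TPOTClassifier is used in place of the generic estimator name, and the scheduler address and constructor values are illustrative assumptions):

```Python
from dask.distributed import Client
from tpot import TPOTClassifier

# Creating a distributed Client makes Dask route work through that cluster,
# so TPOT's pipeline evaluations run on the cluster's workers.
client = Client()  # or Client('tcp://scheduler-address:8786') for an existing cluster (placeholder address)

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2,
                      use_dask=True, n_jobs=-1)
# tpot.fit(X_train, y_train)  # X_train / y_train assumed to be defined elsewhere
```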
     

    This will use all the workers on your cluster to do the training, and use Dask-ML's pipeline rewriting to avoid re-fitting estimators multiple times on the same set of data. diff --git a/docs_sources/api.md index 55c95646..6030c363 100644 --- a/docs_sources/api.md +++ b/docs_sources/api.md @@ -7,6 +7,7 @@ subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -107,7 +108,7 @@ Setting subsample=0.5 tells TPOT to use a random subsample of half of t

    Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

    -Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets.
    max_time_mins: integer or None, optional (default=None) @@ -146,6 +147,15 @@ Possible inputs are: See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
    +template: string (default="RandomTree") +
    +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

    +So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
    + warm_start: boolean, optional (default=False)
    Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -489,6 +499,7 @@ Does not return anything subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, random_state=None, config_dict=None, + template="RandomTree", warm_start=False, memory=None, use_dask=False, @@ -590,7 +601,7 @@ Setting subsample=0.5 tells TPOT to use a random subsample of half of t
    Number of processes to use in parallel for evaluating pipelines during the TPOT optimization process.

    -Setting n_jobs=-1 will use as many cores as available on the computer. Beware that using multiple processes on the same machine may cause memory issues for large datasets +Setting n_jobs=-1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Beware that using multiple processes on the same machine may cause memory issues for large datasets
    max_time_mins: integer or None, optional (default=None) @@ -629,6 +640,15 @@ Possible inputs are: See the built-in configurations section for the list of configurations included with TPOT, and the custom configuration section for more information and examples of how to create your own TPOT configurations.
    +template: string (default="RandomTree") +
    +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. +

    +So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Regressor". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. +
    + warm_start: boolean, optional (default=False)
    Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). diff --git a/docs_sources/using.md b/docs_sources/using.md index 4662456b..48551932 100644 --- a/docs_sources/using.md +++ b/docs_sources/using.md @@ -240,7 +240,7 @@ See the section on scoring functions for more d Any positive integer or -1 Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process.

    -Assigning this to -1 will use as many cores as available on the computer. +Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. -maxtime @@ -282,6 +282,15 @@ See the built-in configurations +-template +TEMPLATE +String +Template of predefined pipeline structure. The option is for specifying a desired structure for the machine learning pipeline evaluated in TPOT. So far this option only supports linear pipeline structure. Each step in the pipeline should be a main class of operators (Selector, Transformer, Classifier or Regressor) or a specific operator (e.g. `SelectPercentile`) defined in TPOT operator configuration. If one step is a main class, TPOT will randomly assign all subclass operators (subclasses of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17), [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html), [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html) or [`RegressorMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.RegressorMixin.html) in scikit-learn) to that step. Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier". By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. + +See the template option in tpot section for more details. + + + -memory MEMORY String or file path @@ -526,6 +535,53 @@ For more detailed examples of how to customize TPOT's operator configuration, se Note that you must have all of the corresponding packages for the operators installed on your computer, otherwise TPOT will not be able to use them. For example, if XGBoost is not installed on your computer, then TPOT will simply not import nor use XGBoost in the pipelines it considers. + +# Template option in TPOT + +Template option provides a way to specify a desired structure for machine learning pipeline, which may reduce TPOT computation time and potentially provide more interpretable results. Current implementation only supports linear pipelines. + +Below is a simple example to use `template` option. The pipelines generated/evaluated in TPOT will follow this structure: 1st step is a feature selector (a subclass of [`SelectorMixin`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/base.py#L17)), 2nd step is a feature transformer (a subclass of [`TransformerMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.TransformerMixin.html)) and 3rd step is a classifier for classification (a subclass of [`ClassifierMixin`](https://scikit-learn.org/stable/modules/generated/sklearn.base.ClassifierMixin.html)). The last step must be `Classifier` for `TPOTClassifier`'s template but `Regressor` for `TPOTRegressor`. **Note: although `SelectorMixin` is subclass of `TransformerMixin` in scikit-leawrn, but `Transformer` in this option excludes those subclasses of `SelectorMixin`.** + +```Python +tpot_obj = TPOTClassifier( + template='Selector-Transformer-Classifier' + ) +``` + +If a specific operator, e.g. `SelectPercentile`, is prefered to used in the 1st step of pipeline, the template can be defined like 'SelectPercentile-Transformer-Classifier'. + + +# FeatureSetSelector in TPOT + +`FeatureSetSelector` is a special new operator in TPOT. 
This operator enables feature selection based on *priori* export knowledge. For example, in RNA-seq gene expression analysis, this operator can be used to select one or more gene (feature) set(s) based on GO (Gene Ontology) terms or annotated gene sets Molecular Signatures Database ([MSigDB](http://software.broadinstitute.org/gsea/msigdb/index.jsp)) in the 1st step of pipeline via `template` option above, in order to reduce dimensions and TPOT computation time. This operator requires a dataset list in csv format. In this csv file, there are only three columns: 1st column is feature set names, 2nd column is the total number of features in one set and 3rd column is a list of feature names (if input X is pandas.DataFrame) or indexes (if input X is numpy.ndarray) delimited by ";". Below is a example how to use this operator in TPOT. + +Please check our [preprint paper](https://www.biorxiv.org/content/10.1101/502484v1.article-info) for more details. + +```Python +from tpot import TPOTClassifier +import numpy as np +import pandas as pd +from tpot.config import classifier_config_dict +test_data = pd.read_csv("https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/tests.csv") +test_X = test_data.drop("class", axis=1) +test_y = test_data['class'] + +# add FeatureSetSelector into tpot configuration +classifier_config_dict['tpot.builtins.FeatureSetSelector'] = { + 'subset_list': ['https://raw.githubusercontent.com/EpistasisLab/tpot/master/tests/subset_test.csv'], + 'sel_subset': [0,1] # select only one feature set, a list of index of subset in the list above + #'sel_subset': list(combinations(range(3), 2)) # select two feature sets +} + + +tpot = TPOTClassifier(generations=5, + population_size=50, verbosity=2, + template='FeatureSetSelector-Transformer-Classifier', + config_dict=classifier_config_dict) +tpot.fit(test_X, test_y) +``` + + # Pipeline caching in TPOT With the `memory` parameter, pipelines can cache the results of each transformer after fitting them. This feature is used to avoid repeated computation by transformers within a pipeline if the parameters and input data are identical to another fitted pipeline during optimization process. TPOT allows users to specify a custom directory path or [`sklearn.external.joblib.Memory`](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/externals/joblib/memory.py#L847) in case they want to re-use the memory cache in future TPOT runs (or a `warm_start` run). @@ -581,10 +637,10 @@ For large problems or working on Jupyter notebook, we highly recommend that you The [dask-examples binder](https://mybinder.org/v2/gh/dask/dask-examples/master?filepath=machine-learning%2Ftpot.ipynb) has a runnable example with a small dask cluster. -To use your Dask cluster to fit a TPOT model, specify the ``use_dask`` keyword when you create the TPOT estimator. **Note: if `use_dask=True`, TPOT will use as many cores as available on the your Dask cluster regardless of whether `n_jobs` is specified.** +To use your Dask cluster to fit a TPOT model, specify the ``use_dask`` keyword when you create the TPOT estimator. **Note: if `use_dask=True`, TPOT will use as many cores as available on the your Dask cluster. If `n_jobs` is specified, then it will control the chunk size (10*`n_jobs` if it is less then offspring size) of parallel training. 
```python -estimator = TPOTEstimator(use_dask=True) +estimator = TPOTEstimator(use_dask=True, n_jobs=-1) ``` This will use all the workers on your cluster to do the training, and use [Dask-ML's pipeline rewriting](https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html#avoid-repeated-work) to avoid re-fitting estimators multiple times on the same set of data. diff --git a/tests/driver_tests.py b/tests/driver_tests.py index 3204012a..7360135b 100644 --- a/tests/driver_tests.py +++ b/tests/driver_tests.py @@ -252,6 +252,7 @@ def test_default_param(self): self.assertEqual(args.SUBSAMPLE, 1.0) self.assertEqual(args.SCORING_FN, None) self.assertEqual(args.TARGET_NAME, 'class') + self.assertEqual(args.TEMPLATE, 'RandomTree') self.assertEqual(args.TPOT_MODE, 'classification') self.assertEqual(args.VERBOSITY, 1) @@ -288,6 +289,7 @@ def test_print_args(self): SCORING_FN = accuracy SUBSAMPLE = 1.0 TARGET_NAME = class +TEMPLATE = RandomTree TPOT_MODE = classification VERBOSITY = 1 @@ -330,6 +332,7 @@ def test_print_args_2(self): SCORING_FN = neg_mean_squared_error SUBSAMPLE = 1.0 TARGET_NAME = class +TEMPLATE = RandomTree TPOT_MODE = regression VERBOSITY = 1 diff --git a/tests/export_tests.py b/tests/export_tests.py index 2c368090..99e822ed 100644 --- a/tests/export_tests.py +++ b/tests/export_tests.py @@ -62,16 +62,14 @@ def test_export_random_ind(): """Assert that the TPOTClassifier can generate the same pipeline export with random seed of 39.""" - tpot_obj = TPOTClassifier(random_state=39) + tpot_obj = TPOTClassifier(random_state=39, config_dict="TPOT light") tpot_obj._fit_init() tpot_obj._pbar = tqdm(total=1, disable=True) pipeline = tpot_obj._toolbox.individual() expected_code = """import numpy as np import pandas as pd -from sklearn.feature_selection import SelectPercentile, f_classif from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.tree import DecisionTreeClassifier +from sklearn.naive_bayes import BernoulliNB # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) @@ -79,10 +77,7 @@ def test_export_random_ind(): training_features, testing_features, training_target, testing_target = \\ train_test_split(features, tpot_data['target'].values, random_state=39) -exported_pipeline = make_pipeline( - SelectPercentile(score_func=f_classif, percentile=65), - DecisionTreeClassifier(criterion="gini", max_depth=7, min_samples_leaf=4, min_samples_split=18) -) +exported_pipeline = BernoulliNB(alpha=1.0, fit_prior=False) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features) diff --git a/tests/feature_set_selector_tests.py b/tests/feature_set_selector_tests.py new file mode 100644 index 00000000..a5dc9b47 --- /dev/null +++ b/tests/feature_set_selector_tests.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. 
+ +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . + +""" + +import numpy as np +import pandas as pd +from tpot.builtins import FeatureSetSelector +from nose.tools import assert_raises + +test_data = pd.read_csv("tests/tests.csv") +test_X = test_data.drop("class", axis=1) + + +def test_FeatureSetSelector_1(): + """Assert that the StackingEstimator returns transformed X based on test feature list 1.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 5 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_FeatureSetSelector_2(): + """Assert that the StackingEstimator returns transformed X based on test feature list 2.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_2") + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 6 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_FeatureSetSelector_3(): + """Assert that the StackingEstimator returns transformed X based on 2 subsets' names""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=["test_subset_1", "test_subset_2"]) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 7 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_FeatureSetSelector_4(): + """Assert that the StackingEstimator returns transformed X based on 2 subsets' indexs""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=[0, 1]) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 7 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_FeatureSetSelector_5(): + """Assert that the StackingEstimator returns transformed X seleced based on test feature list 1's index.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset=0) + ds.fit(test_X, y=None) + transformed_X = ds.transform(test_X) + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 5 + assert np.array_equal(transformed_X, test_X[ds.feat_list].values) + +def test_FeatureSetSelector_6(): + """Assert that the _get_support_mask function returns correct mask.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds.fit(test_X, y=None) + mask = ds._get_support_mask() + get_mask = ds.get_support() + + assert mask.shape[0] == 30 + assert np.count_nonzero(mask) == 5 + assert np.array_equal(get_mask, mask) + +def test_FeatureSetSelector_7(): + """Assert that the StackingEstimator works as expected when input X is 
np.array.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_1") + ds.fit(test_X.values, y=None) + transformed_X = ds.transform(test_X.values) + str_feat_list = [str(i+2) for i in ds.feat_list_idx] + + + assert transformed_X.shape[0] == test_X.shape[0] + assert transformed_X.shape[1] != test_X.shape[1] + assert transformed_X.shape[1] == 5 + assert np.array_equal(transformed_X, test_X.values[:, ds.feat_list_idx]) + assert np.array_equal(transformed_X, test_X[str_feat_list].values) + + +def test_FeatureSetSelector_8(): + """Assert that the StackingEstimator rasies ValueError when features are not available.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4") + assert_raises(ValueError, ds.fit, test_X) + + +def test_FeatureSetSelector_9(): + """Assert that the StackingEstimator __name__ returns correct class name.""" + ds = FeatureSetSelector(subset_list="tests/subset_test.csv", sel_subset="test_subset_4") + assert ds.__name__ == 'FeatureSetSelector' diff --git a/tests/stats_test.py b/tests/stats_test.py index 8b757bd9..66c6873a 100644 --- a/tests/stats_test.py +++ b/tests/stats_test.py @@ -122,7 +122,7 @@ def test_mut_operator_stats_update(): for _ in range(10): offspring, = tpot_obj._random_mutation_operator(ind) - + assert offspring.statistics['crossover_count'] == ind.statistics['crossover_count'] assert offspring.statistics['mutation_count'] == ind.statistics['mutation_count'] + 1 assert offspring.statistics['predecessor'] == (str(ind),) diff --git a/tests/subset_test.csv b/tests/subset_test.csv new file mode 100644 index 00000000..612e8665 --- /dev/null +++ b/tests/subset_test.csv @@ -0,0 +1,5 @@ +Subset,Size,Features +test_subset_1,5,2;4;9;11;14 +test_subset_2,6,2;3;4;5;9;11 +test_subset_3,4,12;7;9;11 +test_subset_4,2,33;34 diff --git a/tests/test_dask_based.py b/tests/test_dask_based.py index b4daf0f4..ad94f8b3 100644 --- a/tests/test_dask_based.py +++ b/tests/test_dask_based.py @@ -27,7 +27,7 @@ def test_dask_matches(self): cv=3, random_state=42, n_jobs=n_jobs, - use_dask=False, + use_dask=False ) b = TPOTClassifier( generations=0, @@ -35,7 +35,7 @@ def test_dask_matches(self): cv=3, random_state=42, n_jobs=n_jobs, - use_dask=True, + use_dask=True ) a.fit(X, y) b.fit(X, y) diff --git a/tests/tpot_tests.py b/tests/tpot_tests.py index 697689bb..92e4a31d 100644 --- a/tests/tpot_tests.py +++ b/tests/tpot_tests.py @@ -30,7 +30,6 @@ from tpot.gp_deap import mutNodeReplacement, _wrapped_cross_val_score, pick_two_individuals_eligible_for_crossover, cxOnePoint, varOr, initialize_stats_dict from tpot.metrics import balanced_accuracy, SCORERS from tpot.operator_utils import TPOTOperatorClassFactory, set_sample_weight, source_decode -from tpot.decorators import pretest_X, pretest_y from tpot.config.classifier import classifier_config_dict from tpot.config.classifier_light import classifier_config_dict_light @@ -54,10 +53,12 @@ from tempfile import mkdtemp from shutil import rmtree -from sklearn.datasets import load_digits, load_boston +from sklearn.datasets import load_digits, load_boston, make_classification from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold from sklearn.externals.joblib import Memory from sklearn.metrics import make_scorer, roc_auc_score +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin from deap import creator, gp from deap.tools import ParetoFront from nose.tools 
import assert_raises, assert_not_equal, assert_greater_equal, assert_equal, assert_in @@ -94,6 +95,9 @@ def closing(arg): training_features_r, testing_features_r, training_target_r, testing_target_r = \ train_test_split(boston_data.data, boston_data.target, random_state=42) +# Set up a small test dataset + +pretest_X, pretest_y = make_classification(n_samples=100, n_features=10, random_state=42) # Set up pandas DataFrame for testing input_data = pd.read_csv( @@ -279,6 +283,8 @@ def test_init_n_jobs(): """Assert that the TPOT init stores current number of processes.""" tpot_obj = TPOTClassifier(n_jobs=2) assert tpot_obj.n_jobs == 2 + tpot_obj._fit_init() + assert tpot_obj._n_jobs == 2 tpot_obj = TPOTClassifier(n_jobs=-1) assert tpot_obj.n_jobs == -1 @@ -286,6 +292,23 @@ def test_init_n_jobs(): assert tpot_obj._n_jobs == cpu_count() +def test_init_n_jobs_2(): + """Assert that the TPOT init assign right""" + tpot_obj = TPOTClassifier(n_jobs=-2) + assert tpot_obj.n_jobs == -2 + + tpot_obj._fit_init() + assert tpot_obj._n_jobs == cpu_count() - 1 + + +def test_init_n_jobs_3(): + """Assert that the TPOT init rasies ValueError if n_jobs=0.""" + tpot_obj = TPOTClassifier(n_jobs=0) + assert tpot_obj.n_jobs == 0 + + assert_raises(ValueError, tpot_obj._fit_init) + + def test_timeout(): """Assert that _wrapped_cross_val_score return Timeout in a time limit.""" tpot_obj = TPOTRegressor(scoring='neg_mean_squared_error') @@ -595,6 +618,90 @@ def test_sample_weight_func(): assert np.allclose(known_score, score) +def test_template_1(): + """Assert that TPOT template option generates pipeline when each step is a type of operator.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='Selector-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert operator_count == 3 + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) + + +def test_template_2(): + """Assert that TPOT template option generates pipeline when each step is operator type with a duplicate main type.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='Selector-Selector-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert operator_count == 4 + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[3][1].__class__, ClassifierMixin) + + +def test_template_3(): + """Assert that TPOT template option generates pipeline when one of steps is a specific operator.""" + + tpot_obj = TPOTClassifier( + random_state=42, + verbosity=0, + template='SelectPercentile-Transformer-Classifier' + ) + tpot_obj._fit_init() + pop = tpot_obj._toolbox.population(n=10) + for deap_pipeline in pop: + operator_count = tpot_obj._operator_count(deap_pipeline) + sklearn_pipeline = tpot_obj._toolbox.compile(expr=deap_pipeline) + assert 
operator_count == 3 + assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower() + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) + + +def test_template_4(): + """Assert that TPOT template option generates pipeline when one of steps is a specific operator.""" + + tpot_obj = TPOTClassifier( + population_size=5, + generations=2, + random_state=42, + verbosity=0, + config_dict = 'TPOT light', + template='SelectPercentile-Transformer-Classifier' + ) + tpot_obj.fit(pretest_X, pretest_y) + + assert isinstance(tpot_obj._optimized_pipeline, creator.Individual) + assert not (tpot_obj._start_datetime is None) + + sklearn_pipeline = tpot_obj.fitted_pipeline_ + operator_count = tpot_obj._operator_count(tpot_obj._optimized_pipeline) + assert operator_count == 3 + assert sklearn_pipeline.steps[0][0] == 'SelectPercentile'.lower() + assert issubclass(sklearn_pipeline.steps[0][1].__class__, SelectorMixin) + assert issubclass(sklearn_pipeline.steps[1][1].__class__, TransformerMixin) + assert issubclass(sklearn_pipeline.steps[2][1].__class__, ClassifierMixin) + + def test_fit_GroupKFold(): """Assert that TPOT properly handles the group parameter when using GroupKFold.""" # This check tests if the darker MNIST images would generalize to the lighter ones. @@ -933,18 +1040,23 @@ def test_memory_3(): def test_memory_4(): - """Assert that the TPOT _setup_memory function rasies ValueError with a invalid path.""" + """Assert that the TPOT _setup_memory function create a directory which does not exist.""" + cachedir = mkdtemp() + dir = cachedir + '/test' tpot_obj = TPOTClassifier( random_state=42, population_size=1, offspring_size=2, generations=1, config_dict='TPOT light', - memory="./fake_temp_dir", + memory=dir, verbosity=0 ) + tpot_obj._setup_memory() + assert os.path.isdir(dir) + rmtree(cachedir) + - assert_raises(ValueError, tpot_obj._setup_memory) def test_memory_5(): @@ -1565,6 +1677,7 @@ def test_check_dataset(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target) assert np.allclose(ret_features, training_features) @@ -1581,6 +1694,7 @@ def test_check_dataset_2(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target)+1)) ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target, test_sample_weight) test_sample_weight[0] = 'opps' @@ -1598,6 +1712,7 @@ def test_check_dataset_3(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target)+1)) ret_features, ret_target = tpot_obj._check_dataset(training_features, training_target, test_sample_weight) test_sample_weight[0] = np.nan @@ -1615,6 +1730,7 @@ def test_check_dataset_4(): verbosity=0, config_dict='TPOT light' ) + tpot_obj._fit_init() test_sample_weight = list(range(1, len(training_target))) assert_raises(ValueError, tpot_obj._check_dataset, training_features, training_target, test_sample_weight) @@ -1629,7 +1745,7 @@ def test_check_dataset_5(): verbosity=0, config_dict='TPOT light' ) - + tpot_obj._fit_init() ret_features = tpot_obj._check_dataset(training_features, target=None) assert np.allclose(ret_features, training_features) @@ -1829,7 +1945,7 @@ def test_tpot_operator_factory_class(): assert len(tpot_argument_list) == 9 
assert tpot_operator_list[0].root is True assert tpot_operator_list[1].root is False - assert tpot_operator_list[2].type() == "Classifier or Regressor" + assert tpot_operator_list[2].type() == "Classifier" assert tpot_argument_list[1].values == [True, False] @@ -1869,8 +1985,8 @@ def pareto_eq(ind1, ind2): pop = tpot_obj._evaluate_individuals(pipelines, pretest_X, pretest_y) fitness_scores = [ind.fitness.values for ind in pop] - known_scores = [(2, 0.94000000000000006), (5000.0, -float('inf'))] - assert np.allclose(known_scores, fitness_scores) + assert fitness_scores[0][0] == 2 + assert fitness_scores[1][0] == 5000.0 def test_pick_two_individuals_eligible_for_crossover(): @@ -2159,7 +2275,7 @@ def test_varOr_3(): def test_operator_type(): """Assert that TPOT operators return their type, e.g. 'Classifier', 'Preprocessor'.""" - assert TPOTSelectPercentile.type() == "Preprocessor or Selector" + assert TPOTSelectPercentile.type() == "Selector" def test_gen(): diff --git a/tpot/_version.py b/tpot/_version.py index ab687bdd..cc571e81 100644 --- a/tpot/_version.py +++ b/tpot/_version.py @@ -23,4 +23,4 @@ """ -__version__ = '0.9.6' +__version__ = '0.10.0' diff --git a/tpot/base.py b/tpot/base.py index 80e0edee..0453e5da 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -40,6 +40,7 @@ from shutil import rmtree import numpy as np +from pandas import DataFrame from scipy import sparse import deap from deap import base, creator, tools, gp @@ -105,7 +106,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, mutation_rate=0.9, crossover_rate=0.1, scoring=None, cv=5, subsample=1.0, n_jobs=1, max_time_mins=None, max_eval_time_mins=5, - random_state=None, config_dict=None, + random_state=None, config_dict=None, template='RandomTree', warm_start=False, memory=None, use_dask=False, periodic_checkpoint_folder=None, early_stop=None, verbosity=0, disable_update_check=False): @@ -172,7 +173,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, n_jobs: int, optional (default: 1) Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available - on the computer. + on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. + Thus for n_jobs = -2, all CPUs but one are used. max_time_mins: int, optional (default: None) How many minutes TPOT has to optimize the pipeline. If provided, this setting will override the "generations" parameter and allow @@ -203,6 +205,16 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, String 'TPOT sparse': TPOT uses a configuration dictionary with a one-hot-encoder and the operators normally included in TPOT that also support sparse matrices. + template: string (default: "RandomTree") + Template of predefined pipeline structure. The option is for specifying a desired structure + for the machine learning pipeline evaluated in TPOT. So far this option only supports + linear pipeline structure. Each step in the pipeline should be a main class of operators + (Selector, Transformer, Classifier or Regressor) or a specific operator + (e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class, + TPOT will randomly assign all subclass operators (subclasses of SelectorMixin, + TransformerMixin, ClassifierMixin or RegressorMixin in scikit-learn) to that step. + Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier". 
+ By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly. warm_start: bool, optional (default: False) Flag indicating whether the TPOT instance will reuse the population from previous calls to fit(). @@ -214,7 +226,8 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, TPOT uses memory caching with a temporary directory and cleans it up upon shutdown. String path of a caching directory TPOT uses memory caching with the provided directory and TPOT does NOT clean - the caching directory up upon shutdown. + the caching directory up upon shutdown. If the directory does not exist, TPOT will + create it. Memory object: TPOT uses the instance of sklearn.external.joblib.Memory for memory caching, and TPOT does NOT clean the caching directory up upon shutdown. @@ -268,6 +281,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.periodic_checkpoint_folder = periodic_checkpoint_folder self.early_stop = early_stop self.config_dict = config_dict + self.template = template self.warm_start = warm_start self.memory = memory self.use_dask = use_dask @@ -276,6 +290,28 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self.random_state = random_state + def _setup_template(self, template): + self.template = template + self.template_comp = template.split('-') + if self.template == 'RandomTree': + self._min = 1 + self._max = 3 + else: + self._min = 0 + self._max = 1 + for comp in self.template_comp: + if comp == 'CombineDFs': + self._max += 2 + self._min += 1 + else: + self._max += 1 + self._min += 1 + if self._max - self._min == 1: + self.tree_structure = False + else: + self.tree_structure = True + + def _setup_scoring_function(self, scoring): if scoring: if isinstance(scoring, str): @@ -350,6 +386,7 @@ def _setup_config(self, config_dict): else: self._config_dict = self.default_config_dict + def _read_config_file(self, config_path): if os.path.isfile(config_path): try: @@ -370,6 +407,7 @@ def _read_config_file(self, config_path): '{}'.format(config_path) ) + def _setup_pset(self): if self.random_state is not None: random.seed(self.random_state) @@ -378,39 +416,89 @@ def _setup_pset(self): self._pset = gp.PrimitiveSetTyped('MAIN', [np.ndarray], Output_Array) self._pset.renameArguments(ARG0='input_matrix') self._add_operators() - self._add_terminals() if self.verbosity > 2: print('{} operators have been imported by TPOT.'.format(len(self.operators))) + def _add_operators(self): - for operator in self.operators: - if operator.root: - # We need to add rooted primitives twice so that they can - # return both an Output_Array (and thus be the root of the tree), - # and return a np.ndarray so they can exist elsewhere in the tree. 
- p_types = (operator.parameter_types()[0], Output_Array) - self._pset.addPrimitive(operator, *p_types) - - self._pset.addPrimitive(operator, *operator.parameter_types()) - - # Import required modules into local namespace so that pipelines - # may be evaluated directly - for key in sorted(operator.import_hash.keys()): - module_list = ', '.join(sorted(operator.import_hash[key])) - - if key.startswith('tpot.'): - exec('from {} import {}'.format(key[4:], module_list)) - else: - exec('from {} import {}'.format(key, module_list)) + main_type = ["Classifier", "Regressor", "Selector", "Transformer"] + ret_types = [] + self.op_list = [] + if self.template == "RandomTree": # default pipeline structure + step_in_type = np.ndarray + step_ret_type = Output_Array + for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] + p_types = ([step_in_type] + arg_types, step_ret_type) + if operator.root: + # We need to add rooted primitives twice so that they can + # return both an Output_Array (and thus be the root of the tree), + # and return a np.ndarray so they can exist elsewhere in the tree. + self._pset.addPrimitive(operator, *p_types) + tree_p_types = ([step_in_type] + arg_types, step_in_type) + self._pset.addPrimitive(operator, *tree_p_types) + self._import_hash_and_add_terminals(operator, arg_types) + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) + else: + gp_types = {} + for idx, step in enumerate(self.template_comp): - for var in operator.import_hash[key]: - self.operators_context[var] = eval(var) + # input class in each step + if idx: + step_in_type = ret_types[-1] + else: + step_in_type = np.ndarray + if step != 'CombineDFs': + if idx < len(self.template_comp) - 1: + # create an empty for returning class for strongly-type GP + step_ret_type_name = 'Ret_{}'.format(idx) + step_ret_type = type(step_ret_type_name, (object,), {}) + ret_types.append(step_ret_type) + else: + step_ret_type = Output_Array + if step == 'CombineDFs': + self._pset.addPrimitive(CombineDFs(), [step_in_type, step_in_type], step_in_type) + elif main_type.count(step): # if the step is a main type + for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] + if operator.type() == step: + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) + else: # is the step is a specific operator + for operator in self.operators: + arg_types = operator.parameter_types()[0][1:] + if operator.__name__ == step: + p_types = ([step_in_type] + arg_types, step_ret_type) + self._pset.addPrimitive(operator, *p_types) + self._import_hash_and_add_terminals(operator, arg_types) + self.ret_types = [np.ndarray, Output_Array] + ret_types + + + def _import_hash_and_add_terminals(self, operator, arg_types): + if not self.op_list.count(operator.__name__): + self._import_hash(operator) + self._add_terminals(arg_types) + self.op_list.append(operator.__name__) + + + def _import_hash(self, operator): + # Import required modules into local namespace so that pipelines + # may be evaluated directly + for key in sorted(operator.import_hash.keys()): + module_list = ', '.join(sorted(operator.import_hash[key])) + + if key.startswith('tpot.'): + exec('from {} import {}'.format(key[4:], module_list)) + else: + exec('from {} import {}'.format(key, module_list)) - self._pset.addPrimitive(CombineDFs(), [np.ndarray, np.ndarray], np.ndarray) + for var in operator.import_hash[key]: + 
self.operators_context[var] = eval(var) - def _add_terminals(self): - for _type in self.arguments: + def _add_terminals(self, arg_types): + for _type in arg_types: type_values = list(_type.values) for val in type_values: @@ -424,13 +512,16 @@ def _setup_toolbox(self): creator.create('Individual', gp.PrimitiveTree, fitness=creator.FitnessMulti, statistics=dict) self._toolbox = base.Toolbox() - self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=1, max_=3) + self._toolbox.register('expr', self._gen_grow_safe, pset=self._pset, min_=self._min, max_=self._max) self._toolbox.register('individual', tools.initIterate, creator.Individual, self._toolbox.expr) self._toolbox.register('population', tools.initRepeat, list, self._toolbox.individual) self._toolbox.register('compile', self._compile_to_sklearn) self._toolbox.register('select', tools.selNSGA2) self._toolbox.register('mate', self._mate_operator) - self._toolbox.register('expr_mut', self._gen_grow_safe, min_=1, max_=4) + if self.tree_structure: + self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max + 1) + else: + self._toolbox.register('expr_mut', self._gen_grow_safe, min_=self._min, max_=self._max) self._toolbox.register('mutate', self._random_mutation_operator) @@ -459,6 +550,8 @@ def _fit_init(self): self._setup_config(self.config_dict) + self._setup_template(self.template) + self.operators = [] self.arguments = [] for key in sorted(self._config_dict.keys()): @@ -512,8 +605,12 @@ def _fit_init(self): 'The subsample ratio of the training instance must be in the range (0.0, 1.0].' ) - if self.n_jobs == -1: - self._n_jobs = cpu_count() + if self.n_jobs == 0: + raise ValueError( + 'The value 0 of n_jobs is invalid.' + ) + elif self.n_jobs < 0: + self._n_jobs = cpu_count() + 1 + self.n_jobs else: self._n_jobs = self.n_jobs @@ -560,12 +657,16 @@ def fit(self, features, target, sample_weight=None, groups=None): """ self._fit_init() - features, target = self._check_dataset(features, target, sample_weight) + + self.pretest_X, _, self.pretest_y, _ = train_test_split(features, + target, train_size=min(50, int(0.9*features.shape[0])), + test_size=None, random_state=self.random_state) + # Randomly collect a subsample of training samples for pipeline optimization process. if self.subsample < 1.0: - features, _, target, _ = train_test_split(features, target, train_size=self.subsample, random_state=self.random_state) + features, _, target, _ = train_test_split(features, target, train_size=self.subsample, test_size=None, random_state=self.random_state) # Raise a warning message if the training size is less than 1500 when subsample is not default value if features.shape[0] < 1500: print( @@ -687,12 +788,16 @@ def _setup_memory(self): if self.memory == "auto": # Create a temporary folder to store the transformers of the pipeline self._cachedir = mkdtemp() - elif os.path.isdir(self.memory): - self._cachedir = self.memory else: - raise ValueError( - 'Could not find directory for memory caching: {}'.format(self.memory) - ) + if not os.path.isdir(self.memory): + try: + os.makedirs(self.memory) + except: + raise ValueError( + 'Could not create directory for memory caching: {}'.format(self.memory) + ) + self._cachedir = self.memory + self._memory = Memory(cachedir=self._cachedir, verbose=0) elif isinstance(self.memory, Memory): self._memory = self.memory @@ -1064,17 +1169,29 @@ def _check_dataset(self, features, target, sample_weight=None): 'customized config dictionary supports sparse matriies.' 
) else: - if np.any(np.isnan(features)): - self._imputed = True + if isinstance(features, np.ndarray): + if np.any(np.isnan(features)): + self._imputed = True + elif isinstance(features, DataFrame): + if features.isnull().values.any(): + self._imputed = True + + if self._imputed: features = self._impute_values(features) try: if target is not None: - X, y = check_X_y(features, target, accept_sparse=True, dtype=np.float64) - return X, y + X, y = check_X_y(features, target, accept_sparse=True, dtype=None) + if self._imputed: + return X, y + else: + return features, target else: - X = check_array(features, order="C", accept_sparse=True, dtype=np.float64) - return X + X = check_array(features, accept_sparse=True, dtype=None) + if self._imputed: + return X + else: + return features except (AssertionError, ValueError): raise ValueError( 'Error: Input data is not in a valid format. Please confirm ' @@ -1221,8 +1338,12 @@ def _evaluate_individuals(self, population, features, target, sample_weight=None result_score_list = self._update_val(val, result_score_list) else: # chunk size for pbar update - # chunk size is min of cpu_count * 2 and n_jobs * 4 - chunk_size = min(cpu_count()*2, self._n_jobs*4) + if self.use_dask: + # chunk size is min of _lambda and n_jobs * 10 + chunk_size = min(self._lambda, self._n_jobs*10) + else: + # chunk size is min of cpu_count * 2 and n_jobs * 4 + chunk_size = min(cpu_count()*2, self._n_jobs*4) for chunk_idx in range(0, len(sklearn_pipeline_list), chunk_size): self._stop_by_max_time_mins() if self.use_dask: @@ -1321,6 +1442,12 @@ def _preprocess_individuals(self, individuals): # Disallow certain combinations of operators because they will take too long or take up too much RAM # This is a fairly hacky way to prevent TPOT from getting stuck on bad pipelines and should be improved in a future release individual_str = str(individual) + if not len(individual): # a pipeline cannot be randomly generated + self.evaluated_individuals_[individual_str] = self._combine_individual_stats(5000., + -float('inf'), + individual.statistics) + self._update_pbar(pbar_msg='Invalid pipeline encountered. Skipping its evaluation.') + continue sklearn_pipeline_str = generate_pipeline_code(expr_to_tree(individual, self._pset), self.operators) if sklearn_pipeline_str.count('PolynomialFeatures') > 1: self.evaluated_individuals_[individual_str] = self._combine_individual_stats(5000., @@ -1450,15 +1577,17 @@ def _random_mutation_operator(self, individual, allow_shrink=True): Returns the individual with one of the mutations applied to it """ - mutation_techniques = [ - partial(gp.mutInsert, pset=self._pset), - partial(mutNodeReplacement, pset=self._pset) - ] - - # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives. - number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) - if number_of_primitives > 1 and allow_shrink: - mutation_techniques.append(partial(gp.mutShrink)) + if self.tree_structure: + mutation_techniques = [ + partial(gp.mutInsert, pset=self._pset), + partial(mutNodeReplacement, pset=self._pset) + ] + # We can't shrink pipelines with only one primitive, so we only add it if we find more primitives. 
+ number_of_primitives = sum([isinstance(node, deap.gp.Primitive) for node in individual]) + if number_of_primitives > 1 and allow_shrink: + mutation_techniques.append(partial(gp.mutShrink)) + else: + mutation_techniques = [partial(mutNodeReplacement, pset=self._pset)] mutator = np.random.choice(mutation_techniques) @@ -1484,7 +1613,7 @@ def _random_mutation_operator(self, individual, allow_shrink=True): # Sometimes you have pipelines for which every shrunk version has already been explored too. # To still mutate the individual, one of the two other mutators should be applied instead. if ((unsuccesful_mutations == 50) and - (type(mutator) is partial and mutator.func is gp.mutShrink)): + (type(mutator) is partial and mutator.func is gp.mutShrink)): offspring, = self._random_mutation_operator(individual, allow_shrink=False) return offspring, @@ -1512,10 +1641,11 @@ def _gen_grow_safe(self, pset, min_, max_, type_=None): def condition(height, depth, type_): """Stop when the depth is equal to height or when a node should be a terminal.""" - return type_ not in [np.ndarray, Output_Array] or depth == height + return type_ not in self.ret_types or depth == height return self._generate(pset, min_, max_, condition, type_) + def _operator_count(self, individual): """Count the number of pipeline operators as a measure of pipeline complexity. diff --git a/tpot/builtins/__init__.py b/tpot/builtins/__init__.py index 7d193fc0..fe6c7f50 100644 --- a/tpot/builtins/__init__.py +++ b/tpot/builtins/__init__.py @@ -28,3 +28,4 @@ from .stacking_estimator import StackingEstimator from .one_hot_encoder import OneHotEncoder, auto_select_categorical_features, _transform_selected from .feature_transformers import CategoricalSelector, ContinuousSelector +from .feature_set_selector import FeatureSetSelector diff --git a/tpot/builtins/feature_set_selector.py b/tpot/builtins/feature_set_selector.py new file mode 100644 index 00000000..428a60ac --- /dev/null +++ b/tpot/builtins/feature_set_selector.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""This file is part of the TPOT library. + +TPOT was primarily developed at the University of Pennsylvania by: + - Randal S. Olson (rso@randalolson.com) + - Weixuan Fu (weixuanf@upenn.edu) + - Daniel Angell (dpa34@drexel.edu) + - and many more generous open source contributors + +TPOT is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as +published by the Free Software Foundation, either version 3 of +the License, or (at your option) any later version. + +TPOT is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with TPOT. If not, see . +""" +import numpy as np +import pandas as pd +import os, os.path +from sklearn.base import BaseEstimator +from sklearn.feature_selection.base import SelectorMixin +from sklearn.utils.validation import check_is_fitted + + +class FeatureSetSelector(BaseEstimator, SelectorMixin): + """Select predefined feature subsets.""" + + @property + def __name__(self): + """Instance name is the same as the class name.""" + return self.__class__.__name__ + + def __init__(self, subset_list, sel_subset): + """Create a FeatureSetSelector object. 
+ + Parameters + ---------- + subset_list: string, required + Path to a file that indicates all the subset lists. Currently, + this file needs to be a .csv with one header row. + There should be 3 columns on the table, including subset names (Subset), + number of features (Size) and features in the subset (Features). + The feature names or indexs of input features + should be seprated by ';' on the 3rd column of the file. + The feature names in the files must match those in the (training and + testing) dataset. + sel_subset: int or string or list or tuple + int: index of subset in subset file + string: subset name of subset + list or tuple: list of int or string for indexs or subset names + Returns + ------- + None + + """ + self.subset_list = subset_list + self.sel_subset = sel_subset + + def fit(self, X, y=None): + """Fit FeatureSetSelector for feature selection + + Parameters + ---------- + X: array-like of shape (n_samples, n_features) + The training input samples. + y: array-like, shape (n_samples,) + The target values (integers that correspond to classes in classification, real numbers in regression). + + Returns + ------- + self: object + Returns a copy of the estimator + """ + subset_df = pd.read_csv(self.subset_list, header=0, index_col=0) + + if isinstance(self.sel_subset, int): + self.sel_subset_name = subset_df.index[self.sel_subset] + elif isinstance(self.sel_subset, str): + self.sel_subset_name = self.sel_subset + else: # list or tuple + self.sel_subset_name = [] + for s in self.sel_subset: + if isinstance(s, int): + self.sel_subset_name.append(subset_df.index[s]) + else: + self.sel_subset_name.append(s) + + + sel_features = subset_df.loc[self.sel_subset_name, 'Features'] + if not isinstance(sel_features, str): + sel_features = ";".join(sel_features.tolist()) + + sel_uniq_features = set(sel_features.split(';')) + + if isinstance(X, pd.DataFrame): # use columns' names + self.feature_names = list(X.columns.values) + self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) + self.feat_list_idx = [list(X.columns).index(feat_name) for feat_name in self.feat_list] + elif isinstance(X, np.ndarray): # use index + self.feature_names = list(range(X.shape[1])) + sel_uniq_features = [int(val) for val in sel_uniq_features] + self.feat_list = sorted(list(set(sel_uniq_features).intersection(set(self.feature_names)))) + self.feat_list_idx = self.feat_list + + if not len(self.feat_list): + raise ValueError('No feature is found on the subset list!') + return self + + def transform(self, X): + """Make subset after fit + + Parameters + ---------- + X: numpy ndarray, {n_samples, n_features} + New data, where n_samples is the number of samples and n_features is the number of features. + + Returns + ------- + X_transformed: array-like, shape (n_samples, n_features + 1) or (n_samples, n_features + 1 + n_classes) for classifier with predict_proba attribute + The transformed feature set. + """ + if isinstance(X, pd.DataFrame): + X_transformed = X[self.feat_list].values + elif isinstance(X, np.ndarray): + X_transformed = X[:, self.feat_list_idx] + + return X_transformed.astype(np.float64) + + def _get_support_mask(self): + """ + Get the boolean mask indicating which features are selected + Returns + ------- + support : boolean array of shape [# input features] + An element is True iff its corresponding feature is selected for + retention. 
+ """ + check_is_fitted(self, 'feat_list_idx') + n_features = len(self.feature_names) + mask = np.zeros(n_features, dtype=bool) + mask[np.asarray(self.feat_list_idx)] = True + + return mask diff --git a/tpot/decorators.py b/tpot/decorators.py index b9ea97bf..f03ac5e7 100644 --- a/tpot/decorators.py +++ b/tpot/decorators.py @@ -26,18 +26,11 @@ from __future__ import print_function from functools import wraps import warnings -from sklearn.datasets import make_classification, make_regression from .export_utils import expr_to_tree, generate_pipeline_code from deap import creator NUM_TESTS = 10 -# generate a small data set for a new pipeline, in order to check if the pipeline -# has unsuppported combinations in params -pretest_X, pretest_y = make_classification(n_samples=50, n_features=10, random_state=42) -pretest_X_reg, pretest_y_reg = make_regression(n_samples=50, n_features=10, random_state=42) - - def _pre_test(func): """Check if the wrapped function works with a pretest data set. @@ -64,33 +57,43 @@ def check_pipeline(self, *args, **kwargs): # clone individual before each func call so it is not altered for # the possible next cycle loop args = [self._toolbox.clone(arg) if isinstance(arg, creator.Individual) else arg for arg in args] - try: - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - - expr = func(self, *args, **kwargs) - # mutation operator returns tuple (ind,); crossover operator - # returns tuple of (ind1, ind2) - expr_tuple = expr if isinstance(expr, tuple) else (expr,) - - for expr_test in expr_tuple: - pipeline_code = generate_pipeline_code( - expr_to_tree(expr_test, self._pset), - self.operators - ) - sklearn_pipeline = eval(pipeline_code, self.operators_context) - - if self.classification: - sklearn_pipeline.fit(pretest_X, pretest_y) - else: - sklearn_pipeline.fit(pretest_X_reg, pretest_y_reg) - bad_pipeline = False + + if func.__name__ == "_generate": + expr = [] + else: + expr = tuple(args) + pass_gen = False + num_test_expr = 0 + # to ensure a pipeline can be generated or mutated. + while not pass_gen and num_test_expr < int(NUM_TESTS/2): + try: + expr = func(self, *args, **kwargs) + pass_gen = True + except: + num_test_expr += 1 + pass + # mutation operator returns tuple (ind,); crossover operator + # returns tuple of (ind1, ind2) + + expr_tuple = expr if isinstance(expr, tuple) else (expr,) + for expr_test in expr_tuple: + pipeline_code = generate_pipeline_code( + expr_to_tree(expr_test, self._pset), + self.operators + ) + sklearn_pipeline = eval(pipeline_code, self.operators_context) + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + sklearn_pipeline.fit(self.pretest_X, self.pretest_y) + + bad_pipeline = False except BaseException as e: - message = '_pre_test decorator: {fname}: num_test={n} {e}'.format( + message = '_pre_test decorator: {fname}: num_test={n} {e}.'.format( n=num_test, fname=func.__name__, e=e + ) # Use the pbar output stream if it's active self._update_pbar(pbar_num=0, pbar_msg=message) @@ -99,4 +102,5 @@ def check_pipeline(self, *args, **kwargs): return expr + return check_pipeline diff --git a/tpot/driver.py b/tpot/driver.py index 8ca54764..bcbbd48f 100755 --- a/tpot/driver.py +++ b/tpot/driver.py @@ -296,7 +296,8 @@ def _get_arg_parser(): help=( 'Number of CPUs for evaluating pipelines in parallel during the ' 'TPOT optimization process. Assigning this to -1 will use as many ' - 'cores as available on the computer.' + 'cores as available on the computer. For n_jobs below -1, ' + '(n_cpus + 1 + n_jobs) are used. 
Thus for n_jobs = -2, all CPUs but one are used.' ) ) @@ -354,6 +355,25 @@ def _get_arg_parser(): ) ) + parser.add_argument( + '-template', + action='store', + dest='TEMPLATE', + default='RandomTree', + type=str, + help=( + 'Template of predefined pipeline structure. The option is for specifying a desired structure' + 'for the machine learning pipeline evaluated in TPOT. So far this option only supports' + 'linear pipeline structure. Each step in the pipeline should be a main class of operators' + '(Selector, Transformer, Classifier or Regressor) or a specific operator' + '(e.g. SelectPercentile) defined in TPOT operator configuration. If one step is a main class,' + 'TPOT will randomly assign all subclass operators (subclasses of SelectorMixin,' + 'TransformerMixin, ClassifierMixin or RegressorMixin in scikit-learn) to that step.' + 'Steps in the template are delimited by "-", e.g. "SelectPercentile-Transformer-Classifier".' + 'By default value of template is "RandomTree", TPOT generates tree-based pipeline randomly.' + ) + ) + parser.add_argument( '-memory', @@ -523,6 +543,7 @@ def tpot_driver(args): max_eval_time_mins=args.MAX_EVAL_MINS, random_state=args.RANDOM_STATE, config_dict=args.CONFIG_FILE, + template=args.TEMPLATE, memory=args.MEMORY, periodic_checkpoint_folder=args.CHECKPOINT_FOLDER, early_stop=args.EARLY_STOP, diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index bd3a34fb..f170c4fc 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -231,11 +231,13 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, # Begin the generational process for gen in range(1, ngen + 1): - - + # after each population save a periodic pipeline + if per_generation_function is not None: + per_generation_function(gen) # Vary the population offspring = varOr(population, toolbox, lambda_, cxpb, mutpb) + # Update generation statistic for all individuals which have invalid 'generation' stats # This hold for individuals that have been altered in the varOr function for ind in population: diff --git a/tpot/gp_types.py b/tpot/gp_types.py index 1d4e8459..d479b653 100644 --- a/tpot/gp_types.py +++ b/tpot/gp_types.py @@ -24,6 +24,5 @@ """ class Output_Array(object): - """Output data type of pipelines.""" - + """Final output data type of pipelines.""" pass diff --git a/tpot/operator_utils.py b/tpot/operator_utils.py index b92b6494..b6061b51 100644 --- a/tpot/operator_utils.py +++ b/tpot/operator_utils.py @@ -25,6 +25,7 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin import inspect @@ -173,18 +174,23 @@ def TPOTOperatorClassFactory(opsourse, opdict, BaseClass=Operator, ArgBaseClass= return None, None else: # define if the operator can be the root of a pipeline - if issubclass(op_obj, ClassifierMixin) or issubclass(op_obj, RegressorMixin): + if issubclass(op_obj, ClassifierMixin): class_profile['root'] = True - optype = "Classifier or Regressor" - else: - optype = "Preprocessor or Selector" + optype = "Classifier" + elif issubclass(op_obj, RegressorMixin): + class_profile['root'] = True + optype = "Regressor" + if issubclass(op_obj, TransformerMixin): + optype = "Transformer" + if issubclass(op_obj, SelectorMixin): + optype = "Selector" @classmethod def op_type(cls): """Return the operator type. 
Possible values: - "Classifier", "Regressor", "Selector", "Preprocessor" + "Classifier", "Regressor", "Selector", "Transformer" """ return optype @@ -233,7 +239,7 @@ def parameter_types(cls): operator """ - return ([np.ndarray] + arg_types, np.ndarray) + return ([np.ndarray] + arg_types, np.ndarray) # (input types, return types) class_profile['parameter_types'] = parameter_types
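
A condensed sketch of the operator-type resolution order introduced in `operator_utils.py` above (simplified from `TPOTOperatorClassFactory`; the `SelectorMixin` import path matches the scikit-learn version targeted by this diff):

```Python
from sklearn.base import ClassifierMixin, RegressorMixin, TransformerMixin
from sklearn.feature_selection.base import SelectorMixin


def resolve_optype(op_obj):
    """Return the TPOT operator type for a scikit-learn estimator class."""
    optype = None
    if issubclass(op_obj, ClassifierMixin):
        optype = "Classifier"    # may be the root of a pipeline
    elif issubclass(op_obj, RegressorMixin):
        optype = "Regressor"     # may be the root of a pipeline
    if issubclass(op_obj, TransformerMixin):
        optype = "Transformer"
    if issubclass(op_obj, SelectorMixin):
        # SelectorMixin inherits from TransformerMixin, so this later check
        # overrides "Transformer" and selectors report "Selector".
        optype = "Selector"
    return optype


# e.g. SelectPercentile resolves to "Selector", BernoulliNB to "Classifier"
```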