## TPOT

In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pre-split train/test CSV files and separate the features from the label.
TARGET_COLUMN = 'booking_status'

data_train = pd.read_csv('../data/ml/hotel_reservations_train.csv')
data_test = pd.read_csv('../data/ml/hotel_reservations_test.csv')

# Features are every column except the label; the target is the label column alone.
X_train, y_train = data_train.drop(columns=[TARGET_COLUMN]), data_train[TARGET_COLUMN]
X_test, y_test = data_test.drop(columns=[TARGET_COLUMN]), data_test[TARGET_COLUMN]

In [3]:
from pathlib import Path

from tpot import TPOTClassifier

# Output locations used by TPOT during the search. They are created up front
# (idempotently) so that a fresh checkout, where ../.internal does not exist
# yet, cannot fail when the log file is opened or a checkpoint is written.
TPOT_CHECKPOINT_DIR = '../.internal/checkpoints'
TPOT_LOG_FILE = '../.internal/tpot.log'
Path(TPOT_CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)
Path(TPOT_LOG_FILE).parent.mkdir(parents=True, exist_ok=True)

# Genetic-programming search over scikit-learn pipelines. Every option is
# spelled out explicitly so the run configuration is visible in one place.
tpot = TPOTClassifier(
    generations=10,                                             # Number of iterations to run the pipeline optimization process
    population_size=100,                                        # Number of individuals to retain in the genetic programming population every generation
    offspring_size=50,                                          # Number of offspring to produce in each genetic programming generation
    mutation_rate=0.9,                                          # Mutation rate for the genetic programming algorithm, in the range [0.0, 1.0]
    crossover_rate=0.1,                                         # Crossover rate for the genetic programming algorithm, in the range [0.0, 1.0]; mutation_rate + crossover_rate must not exceed 1.0
    scoring='accuracy',                                         # Function used to evaluate the quality of a given pipeline for the classification problem
    cv=5,                                                       # Cross-validation strategy used when evaluating pipelines
    subsample=0.8,                                              # Fraction of training samples that are used during the TPOT optimization process
    n_jobs=-1,                                                  # Number of processes to use for evaluating pipelines in parallel during the TPOT optimization process
    max_time_mins=60,                                           # How many minutes TPOT has to optimize the pipeline
    max_eval_time_mins=5,                                       # How many minutes TPOT has to evaluate a single pipeline
    random_state=42,                                            # The seed of the pseudo-random number generator used in TPOT
    config_dict=None,                                           # A configuration dictionary for customizing TPOT's operators and parameters
    template=None,                                              # A pipeline template to begin the optimization process with
    warm_start=False,                                           # Flag indicating if TPOT should reuse the population from previous calls to fit()
    memory=None,                                                # A caching mechanism to use with TPOT
    use_dask=False,                                             # Flag indicating if TPOT should use Dask for evaluation of the pipelines
    periodic_checkpoint_folder=TPOT_CHECKPOINT_DIR,             # Path to a directory where TPOT will save pipelines during the optimization process
    early_stop=None,                                            # How many generations TPOT checks for early stopping. None means no early stopping
    verbosity=2,                                                # How much information TPOT communicates while it's running
    log_file=TPOT_LOG_FILE,                                     # Log file to write to
    disable_update_check=False                                  # Flag indicating if TPOT should check for updates to the TPOT package
)

In [None]:
# Fit the TPOT classifier
# Runs the pipeline search configured above on the training split only; the
# test split (X_test / y_test) is not used in this cell.
# NOTE(review): with max_time_mins=60 this cell can presumably run for up to
# an hour, and progress is expected to go to the configured log file rather
# than the cell output -- confirm against the installed TPOT version.
tpot.fit(X_train, y_train)