In [3]:
import argparse
import logging
import os

from dswizard.components.classification.random_forest import RandomForest
from dswizard.components.data_preprocessing.minmax import MinMaxScalerComponent
from dswizard.core.master import Master
from dswizard.core.model import Dataset
from dswizard.optimizers.bandit_learners.pseudo import PseudoBandit
from dswizard.optimizers.config_generators import SmacGenerator
from dswizard.optimizers.structure_generators.fixed import FixedStructure
from dswizard.optimizers.structure_generators.mcts import TransferLearning, MCTS
from dswizard.util import util

# --- Run configuration ---------------------------------------------------
# OpenML task id
task = 53
# Directory used for logging
log_dir = 'run/'
# Maximum optimization time in seconds
wallclock_limit = 60
# Maximum cutoff time for a single evaluation in seconds
cutoff = 10

# --- Logging -------------------------------------------------------------
# The log file lives in a per-task sub-directory of log_dir.
log_file = os.path.join(log_dir, str(task), 'log.txt')
util.setup_logging(log_file)
logger = logging.getLogger()
# Only let WARNING and above through from the 'matplotlib' logger.
matplotlib_logger = logging.getLogger('matplotlib')
matplotlib_logger.setLevel(logging.WARNING)

# --- Load dataset --------------------------------------------------------
logger.info('Processing task {}'.format(task))
# NOTE(review): the positional 0 is presumably the fold index and 'rocauc'
# the evaluation metric -- confirm against Dataset.from_openml.
ds, ds_test = Dataset.from_openml(task, 0, 'rocauc')


2021-06-03 12:55:45,967 INFO     root            MainThread Processing task 53
2021-06-03 12:55:45,969 INFO     root            MainThread Starting [get] request for the URL https://www.openml.org/api/v1/xml/task/53
2021-06-03 12:55:45,973 DEBUG    urllib3.connectionpool MainThread Starting new HTTPS connection (1): www.openml.org:443
2021-06-03 12:55:46,213 DEBUG    urllib3.connectionpool MainThread https://www.openml.org:443 "GET /api/v1/xml/task/53 HTTP/1.1" 200 615
2021-06-03 12:55:46,220 INFO     root            MainThread 0.2504499s taken for [get] request for the URL https://www.openml.org/api/v1/xml/task/53
2021-06-03 12:55:46,227 INFO     root            MainThread Starting [get] request for the URL https://www.openml.org/api/v1/xml/data/54
2021-06-03 12:55:46,234 DEBUG    urllib3.connectionpool MainThread Starting new HTTPS connection (1): www.openml.org:443
2021-06-03 12:55:46,449 DEBUG    urllib3.connectionpool MainThread https://www.openml.org:443 "GET /api/v1/xml/data/54 

In [7]:
# Per-task directory handed to the Master as its working directory.
working_directory = os.path.join(log_dir, str(task))
# Pickled transfer model loaded by the structure generator. The path is
# relative, so it is resolved against the notebook's current working directory.
transfer_model_path = '../dswizard/dswizard/assets/rf_complete.pkl'
# MCTS structure search driven by the TransferLearning policy.
structure_generator_kwargs = {'policy': TransferLearning}

master = Master(
    ds=ds,
    working_directory=working_directory,
    n_workers=1,
    model=transfer_model_path,
    wallclock_limit=wallclock_limit,
    cutoff=cutoff,
    pre_sample=False,
    config_generator_class=SmacGenerator,
    structure_generator_class=MCTS,
    structure_generator_kwargs=structure_generator_kwargs,
    bandit_learner_class=PseudoBandit,
)

# Run the optimization; the result unpacks into the final pipeline, the run
# history and the ensemble, all of which the analysis cell below consumes.
optimization_result = master.optimize()
pipeline, run_history, ensemble = optimization_result

2021-06-03 12:57:01,495 INFO     Structure       Thread-8   Loading transfer model from ../dswizard/dswizard/assets/rf_complete.pkl
2021-06-03 12:57:02,367 INFO     Worker          MainThread Running on marc-P50 with pid 91377
2021-06-03 12:57:02,369 INFO     Master          MainThread starting run at 2021-06-03 12:57:02. Configuration:
	wallclock_limit: 60
	cutoff: 10
	pre_sample: False
2021-06-03 12:57:02,370 DEBUG    Dispatcher      MainThread Waiting for all workers to finish current work. 0 / 1 busy...
2021-06-03 12:57:02,371 INFO     Master          MainThread Starting repetition 0
2021-06-03 12:57:02,372 DEBUG    Dispatcher      MainThread Processing job (0, 0, None)
2021-06-03 12:57:02,378 DEBUG    Structure       MainProcess 	Selecting ROOT
2021-06-03 12:57:02,401 DEBUG    Structure       MainProcess 	Expanding with dswizard.components.classification.multinomial_nb.MultinomialNB. Option 1/35
2021-06-03 12:57:02,462 INFO     Worker          MainProcess start transforming job (0

In [10]:
# Analysis: report the best structure/configuration found, then score the
# final pipeline and the ensemble on the held-out test split.
_, incumbent = run_history.get_incumbent()
# Best run recorded for the incumbent structure (exposes .config and .loss).
best_run = incumbent.get_incumbent()

logging.info(f'Best found configuration: {incumbent.steps}\n'
             f'{best_run.config} with loss {best_run.loss}')

n_structures = len(run_history.data)
logging.info(f'A total of {n_structures} unique structures where sampled.')
n_runs = len(run_history.get_all_runs())
logging.info(f'A total of {n_runs} runs where executed.')

# Predictions of the single best pipeline on the test data.
y_pred = pipeline.predict(ds_test.X)
y_prob = pipeline.predict_proba(ds_test.X)

logging.info(f'Final pipeline:\n{pipeline}')

test_score = util.score(ds_test.y, y_prob, y_pred, ds.metric)
logging.info(f'Final test performance {test_score}')

# Same evaluation for the ensemble, scored with the same metric.
ensemble_prob = ensemble.predict_proba(ds_test.X)
ensemble_pred = ensemble.predict(ds_test.X)
ensemble_score = util.score(ds_test.y, ensemble_prob, ensemble_pred, ds.metric)
n_individuals = len(ensemble.estimators_)
logging.info(f'Final ensemble performance '
             f'{ensemble_score} '
             f'based on {n_individuals} individuals')

2021-06-03 12:59:40,046 INFO     root            MainThread Best found configuration: [('3', FastICAComponent), ('4', RandomForest)]
Configuration:
  3:algorithm, Value: 'parallel'
  3:fun, Value: 'logcosh'
  3:n_components_factor, Value: 1.0
  3:whiten, Value: True
  4:bootstrap, Value: True
  4:criterion, Value: 'gini'
  4:max_features, Value: 0.5
  4:min_impurity_decrease, Constant: 0.0
  4:min_samples_leaf, Value: 1
  4:min_samples_split, Value: 2
  4:min_weight_fraction_leaf, Constant: 0.0
  4:n_estimators, Constant: 512
 with loss -0.9442383277716109
2021-06-03 12:59:40,047 INFO     root            MainThread A total of 7 unique structures where sampled.
2021-06-03 12:59:40,048 INFO     root            MainThread A total of 20 runs where executed.
2021-06-03 12:59:40,154 INFO     root            MainThread Final pipeline:
FlexiblePipeline(configuration={'3:algorithm': 'parallel', '3:fun': 'logcosh',
                                '3:n_components_factor': 1.0, '3:whiten': True,
 

In [12]:
# Display the RunHistory object returned by master.optimize(). As the recorded
# output shows, its repr only contains the type and memory address.
run_history

<dswizard.core.runhistory.RunHistory at 0x7f3e62090be0>