# Prepare train_data and the test split (X_test, y_test)

In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from hypergbm import make_experiment
from hypernets.tabular.metrics import metric_to_scoring

In [2]:
# Load the breast-cancer dataset as pandas objects (features X, label y).
X, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# Keep 70% of the rows for training; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=335,
)

# Put the training features and label side by side in one frame; this is the
# frame the cells below hand to make_experiment with target='target'.
train_data = pd.concat([X_train, y_train], axis=1)

In [3]:
# Baseline experiment: optimise precision for the positive class (label 1).
# A copy is passed so that train_data itself stays untouched for later cells.
experiment = make_experiment(
    train_data.copy(),
    target='target',
    reward_metric='precision',
    pos_label=1,
    random_state=1234,
    max_trials=20,
)

# Run the experiment and keep what it returns for scoring below.
estimator = experiment.run()

In [4]:
# Score the estimator on the held-out split with the same metric that was
# used as the experiment's reward (precision, positive label 1).
scorer = metric_to_scoring('precision', pos_label=1)
score = scorer(estimator, X_test, y_test)

# Bare expression so the notebook displays the value.
score

0.9568965517241379

# Use Feature selection
Set `feature_selection=True` to select features by feature importance before the HyperGBM search starts. Optional settings:
* feature_selection_strategy: one of *threshold*, *number* or *quantile*. default is *threshold*
* feature_selection_threshold: confidence threshold of feature_importance  
* feature_selection_quantile: confidence quantile of feature_importance  
* feature_selection_number: expected feature number to keep

In [5]:
# Same experiment as the baseline, now with feature selection switched on.
# Only the threshold is set here. Example values for the optional settings
# listed above would be:
#   feature_selection_number=15
#   feature_selection_quantile=0.1
#   feature_selection_threshold=0.1
experiment = make_experiment(
    train_data.copy(),
    target='target',
    reward_metric='precision',
    pos_label=1,
    random_state=1234,
    max_trials=20,
    feature_selection=True,
    feature_selection_threshold=0.0001,
)

# Run the experiment and keep what it returns for inspection and scoring.
estimator = experiment.run()

In [6]:
# Display the steps of the estimator returned by experiment.run(); the output
# below lists data_clean, feature_selection and estimator, in that order.
estimator.steps

[('data_clean',
  DataCleanStep(cv=True,
                data_cleaner_args={'correct_object_dtype': True,
                                   'drop_columns': None,
                                   'drop_constant_columns': True,
                                   'drop_duplicated_columns': False,
                                   'drop_idness_columns': True,
                                   'drop_label_nan_rows': True,
                                   'int_convert_to': 'float', 'nan_chars': None,
                                   'reduce_mem_usage': False,
                                   'reserve_columns': None},
                name='data_clean')),
 ('feature_selection',
  FeatureImportanceSelectionStep(name='feature_selection', number=None,
                                 quantile=None, strategy='threshold',
                                 threshold=0.0001)),
 ('estimator',
  GreedyEnsemble(weight=[0.2, 0.0, 0.0, 0.0, 0.0, 0.05, 0.0, 0.0, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [7]:
# Score the feature-selection estimator on the same held-out split and with
# the same metric as the baseline, so the two precision values are comparable.
scorer = metric_to_scoring('precision', pos_label=1)
score = scorer(estimator, X_test, y_test)

# Bare expression so the notebook displays the value.
score

0.956140350877193