In [1]:
import numpy as np
import pandas as pd
import pathlib

import os
# Switch the working directory to the parent folder; the relative paths used later
# ('logs/...', 'examples/skyline_plots/...') are resolved against it.
# NOTE(review): presumably this is also what makes the `fp` package below importable — confirm.
# NOTE(review): a relative chdir is not idempotent — re-running this cell in the same
# kernel moves up one more directory each time.
os.chdir('..')

import warnings
# Suppress every warning category for the rest of the session (this also hides
# deprecation warnings emitted by the plotting and learning libraries).
warnings.simplefilter('ignore')

from fp.traindata_samplers import CompleteData
from fp.missingvalue_handlers import CompleteCaseAnalysis
from fp.dataset_experiments import GermanCreditDatasetSexExperiment
from fp.scalers import NamedStandardScaler
from fp.learners import LogisticRegression, DecisionTree
from fp.post_processors import NoPostProcessing, RejectOptionPostProcessing
from fp.pre_processors import NoPreProcessing, DIRemover


import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Parameters that are varied across experiment runs.
seeds = [0xbeef, 0xcafe, 0xdead]
learners = [LogisticRegression()]

# Each entry is a (pre-processor, post-processor) pair.
processors = [
    (NoPreProcessing(), NoPostProcessing()),
    (DIRemover(1.0), NoPostProcessing()),
    (NoPreProcessing(), RejectOptionPostProcessing()),
]

# Strategy used to pick the optimal result on the validation set among all the
# processor settings above. Two forms are supported:
#   * a list, e.g. ['accuracy', 'selection_rate', 'false_discovery_rate']:
#     the list is a skyline (priority) order. The optimal setting has the highest
#     accuracy; ties on accuracy are broken by the highest selection rate, then by
#     the highest false discovery rate, and so on.
#   * a dict, e.g. {'accuracy': 0.5, 'selection_rate': 0.3, 'false_discovery_rate': 0.2}:
#     the optimal setting maximizes the weighted sum
#     accuracy*0.5 + selection_rate*0.3 + false_discovery_rate*0.2.
# If several settings reach the highest value under the chosen strategy, all of
# them are returned as optimal.
filter_res_on_val_by_order = [
    'accuracy',
    'selection_rate',
    'false_discovery_rate',
]
filter_res_on_val_by_weight_sum = {
    'accuracy': 0.5,
    'selection_rate': 0.3,
    'false_discovery_rate': 0.2,
}

In [3]:
def calculate_metrics(seed, learners, pre_processors, post_processors, filter_val_strategy):
    '''
        Configure and run a single GermanCreditDatasetSexExperiment for one random seed.

        The experiment receives the full lists of learners, pre-processors and
        post-processors, uses the complete training data, complete-case analysis for
        missing values and a named standard scaler for numeric attributes.

        :param seed: value passed as the experiment's fixed random seed
        :param learners: list of learner objects
        :param pre_processors: list of pre-processor objects
        :param post_processors: list of post-processor objects
        :param filter_val_strategy: strategy (list or dict) for choosing the optimal validation result
        :return: the value of the experiment's generate_file_path() after the run
                 (in the recorded runs, a log folder such as 'logs/<timestamp>_germancreditsex/')
    '''
    experiment_config = dict(
        fixed_random_seed=seed,
        train_data_sampler=CompleteData(),
        missing_value_handler=CompleteCaseAnalysis(),
        numeric_attribute_scaler=NamedStandardScaler(),
        learners=learners,
        pre_processors=pre_processors,
        post_processors=post_processors,
        optimal_validation_strategy=filter_val_strategy,
    )
    experiment = GermanCreditDatasetSexExperiment(**experiment_config)
    experiment.run()
    result_folder = experiment.generate_file_path()
    return result_folder

def run_exp(seeds, learners, processors, filter_val_strategy):
    '''
        Main driver: runs calculate_metrics once per seed and collects the results.

        :param seeds: iterable of random seeds, one experiment run per seed
        :param learners: list of learner objects handed to every run
        :param processors: list of (pre-processor, post-processor) pairs; the pairs are
                           split into two parallel lists before being passed on
        :param filter_val_strategy: strategy (list or dict) for choosing the optimal validation result
        :return: dict mapping each seed to the value returned by calculate_metrics for it
    '''
    # The two lists are rebuilt for every seed so each run receives fresh list objects.
    return {
        current_seed: calculate_metrics(
            current_seed,
            learners,
            [pair[0] for pair in processors],
            [pair[1] for pair in processors],
            filter_val_strategy,
        )
        for current_seed in seeds
    }

In [4]:
# running experiments using above parameters
filter_order_results = run_exp(seeds, learners, processors, filter_res_on_val_by_order)
print (filter_order_results)

complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.5s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
{48879: 'logs/2020-08-31_10-59-54-994_germancreditsex/', 51966: 'logs/2020-08-31_11-00-05-290_germancreditsex/', 57005: 'logs/2020-08-31_11-00-15-865_germancreditsex/'}


In [5]:
# Same experiments, but the optimal validation result is selected with the
# weighted-sum strategy.
filter_formula_results = run_exp(
    seeds=seeds,
    learners=learners,
    processors=processors,
    filter_val_strategy=filter_res_on_val_by_weight_sum,
)
print(filter_formula_results)

complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from training data
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  53 out of  60 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished


complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
complete_case removed 0 instances from validation data
Injecting zero columns for features not present set()
{48879: 'logs/2020-08-31_11-00-24-840_germancreditsex/', 51966: 'logs/2020-08-31_11-00-33-280_germancreditsex/', 57005: 'logs/2020-08-31_11-00-41-993_germancreditsex/'}


## Visualize the result of skyline selection for a single trial

In [6]:
def get_skyline_candidates(seed_path_map, focus_seed):
    '''
        Prepare the skyline candidates data of one seed for visualization.

        Reads 'skyline_options.csv' from the result folder of `focus_seed`, shortens the
        'setting' column to '<pre>_<learner>_<post>' labels and keeps only the rows whose
        label contains 'NoP' (i.e. 'NoPre' and/or 'NoPost').

        :param seed_path_map: dict mapping a seed to its result folder (with or without
                              a trailing path separator)
        :param focus_seed: the seed whose candidates are loaded
        :return: DataFrame with the filtered rows and the shortened 'setting' labels
        :raises KeyError: if `focus_seed` is not in `seed_path_map`, or a setting component
                          has no entry in the label table
    '''
    setting_labels = {'reject_option': 'RO', 'diremover-1.0': 'DI1.0',
                      'no_pre_processing': 'NoPre', 'no_post_processing': 'NoPost',
                      'DecisionTree': 'DT', 'LogisticRegression': 'LR'}
    # positions of the preprocessor (1), learner (5) and postprocessor (6) in a '__'-separated setting
    kept_positions = (1, 5, 6)
    seed_tag = '-' + str(focus_seed)

    def shorten(setting):
        components = setting.split('__')
        kept = [part for idx, part in enumerate(components) if idx in kept_positions]
        if not kept:
            # a setting without any of the expected components cannot be labelled
            raise KeyError(setting)
        labels = []
        for part in kept:
            # Look the raw component up first: a label key may itself contain '-<digits>'
            # (e.g. 'diremover-1.0'), which a blanket removal of the seed tag would
            # corrupt for small seeds such as 1. Only unknown components get the tag removed.
            if part not in setting_labels:
                part = part.replace(seed_tag, '')
            labels.append(setting_labels[part])
        return '_'.join(labels)

    # os.path.join works whether or not the stored folder ends with a separator
    skyline_df = pd.read_csv(os.path.join(seed_path_map[focus_seed], "skyline_options.csv"))
    skyline_df['setting'] = skyline_df['setting'].apply(shorten)

    # show the candidates using only one fairness intervention method
    return skyline_df[skyline_df['setting'].apply(lambda label: 'NoP' in label)]


In [7]:
# read the skyline options for current seed
focus_seed = 0xdead
filter_order_options = get_skyline_candidates(filter_order_results, focus_seed)
filter_order_options.head(5)

Unnamed: 0,setting,data,accuracy,selection_rate,false_discovery_rate,skyline,optimal
0,NoPre_LR_RO,val,0.71,0.89,-0.280899,79.619101,1
1,NoPre_LR_RO,test,0.771144,0.800995,-0.173913,84.950465,1
2,NoPre_LR_NoPost,val,0.71,0.89,-0.280899,79.619101,1
3,NoPre_LR_NoPost,test,0.756219,0.925373,-0.225806,84.649815,1
6,DI1.0_LR_NoPost,val,0.7,0.88,-0.284091,78.515909,0


In [8]:
# Load the skyline candidates of the same seed from the weighted-sum run
filter_formula_options = get_skyline_candidates(
    seed_path_map=filter_formula_results,
    focus_seed=focus_seed,
)
filter_formula_options.head()

Unnamed: 0,setting,data,accuracy,selection_rate,false_discovery_rate,skyline,optimal
0,NoPre_LR_RO,val,0.71,0.89,-0.280899,0.56582,1
1,NoPre_LR_RO,test,0.771144,0.800995,-0.173913,0.591088,1
2,NoPre_LR_NoPost,val,0.71,0.89,-0.280899,0.56582,1
3,NoPre_LR_NoPost,test,0.756219,0.925373,-0.225806,0.61056,1
6,DI1.0_LR_NoPost,val,0.7,0.88,-0.284091,0.557182,0


In [9]:
def output_scatter_plot(f_name, df, x_col, y_col, hue_col='setting', color_p='Set2', jitter_seed=None):
    '''
        Visualization of the skyline options w.r.t. two metrics (X and Y axis) in the skyline inputs.

        Draws two side-by-side scatter plots with shared axes (rows with data == 'val' on
        the left, data == 'test' on the right), marks the 'optimal' column through the
        marker style and saves the figure as '<f_name>.png'.

        :param f_name: output path without the '.png' extension; missing parent folders are created
        :param df: DataFrame with the columns x_col, y_col, hue_col, 'data' and 'optimal'; it is not modified
        :param x_col: column plotted on the X axis (a small uniform jitter is added to separate ties)
        :param y_col: column plotted on the Y axis
        :param hue_col: column that determines the point color
        :param color_p: seaborn palette used for the hue levels
        :param jitter_seed: optional seed that makes the jitter reproducible; with None the
                            global numpy random state is used
    '''
    sns.set(style='whitegrid', font_scale=1.5)

    # add jitters for x to account for ties in the values (uniform in +/- 0.5 / noise_param)
    data = df.copy()
    noise_param = 100
    rng = np.random if jitter_seed is None else np.random.RandomState(jitter_seed)
    jitter = rng.random_sample(data.shape[0]) / noise_param - 1 / noise_param / 2
    data[x_col] = data[x_col] + jitter

    fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(13, 6))

    # a palette is only meaningful when a hue column is mapped
    palette = color_p if hue_col is not None else None
    for ax, split, title in ((ax1, 'val', 'validation'), (ax2, 'test', 'test')):
        # x/y/hue are passed by keyword: recent seaborn releases no longer accept them positionally
        sns.scatterplot(x=x_col, y=y_col, hue=hue_col, style='optimal', s=100,
                        palette=palette, data=data[data['data'] == split], ax=ax)
        ax.set_title(title)
    plt.tight_layout()

    # save plot into the disc, creating the target folder when it is missing
    out_file = f_name + '.png'
    pathlib.Path(out_file).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(out_file)

In [10]:
# Display every shortlisted candidate (validation and test rows) of the order-based run.
filter_order_options

Unnamed: 0,setting,data,accuracy,selection_rate,false_discovery_rate,skyline,optimal
0,NoPre_LR_RO,val,0.71,0.89,-0.280899,79.619101,1
1,NoPre_LR_RO,test,0.771144,0.800995,-0.173913,84.950465,1
2,NoPre_LR_NoPost,val,0.71,0.89,-0.280899,79.619101,1
3,NoPre_LR_NoPost,test,0.756219,0.925373,-0.225806,84.649815,1
6,DI1.0_LR_NoPost,val,0.7,0.88,-0.284091,78.515909,0
7,DI1.0_LR_NoPost,test,0.781095,0.890547,-0.201117,86.813808,0


In [11]:
# Display every shortlisted candidate (validation and test rows) of the weighted-sum run.
filter_formula_options

Unnamed: 0,setting,data,accuracy,selection_rate,false_discovery_rate,skyline,optimal
0,NoPre_LR_RO,val,0.71,0.89,-0.280899,0.56582,1
1,NoPre_LR_RO,test,0.771144,0.800995,-0.173913,0.591088,1
2,NoPre_LR_NoPost,val,0.71,0.89,-0.280899,0.56582,1
3,NoPre_LR_NoPost,test,0.756219,0.925373,-0.225806,0.61056,1
6,DI1.0_LR_NoPost,val,0.7,0.88,-0.284091,0.557182,0
7,DI1.0_LR_NoPost,test,0.781095,0.890547,-0.201117,0.617488,0


In [12]:
# Scatter plot of the order-based candidates: selection rate (X) vs. accuracy (Y)
x_col, y_col = 'selection_rate', 'accuracy'
output_fname = 'examples/skyline_plots/Order_s{}_{}_{}'.format(focus_seed, x_col, y_col)

output_scatter_plot(output_fname, filter_order_options, x_col, y_col)

In [13]:
# Scatter plot of the weighted-sum candidates: selection rate (X) vs. accuracy (Y)
x_col, y_col = 'selection_rate', 'accuracy'
output_fname = 'examples/skyline_plots/Formula_s{}_{}_{}'.format(focus_seed, x_col, y_col)

output_scatter_plot(output_fname, filter_formula_options, x_col, y_col)