In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads all modules
# before each cell execution, so edits to imported project code take effect without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# Resolve the parent of the current working directory and put it at the front of sys.path
# (only if it is not listed yet) — presumably so that the project packages imported further
# down (PIPE_X, data, pipelines) can be found when the notebook runs from its own folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
root_already_listed = project_root in sys.path
if not root_already_listed:
    sys.path.insert(0, project_root)

# Prepare: shared paths, model lists, datasets, and metrics

In [None]:
import os

from PIPE_X.pipeline import pickle_pipeline

# Base directories, written relative to the notebook's working directory (note the leading '..').
CONFIG_BASE_PATH = "../experiments/yaml_files"
DATA_BASE_PATH = "../data"
PIPELINE_BASE_PATH = "../pipelines/pickles"

# Model identifiers handed to generate_yaml.
MODEL_NAMES = ["LR", "DT", "GBT", "KNN", "HGB", "NN", "SVC"]

# Restricted list: only the models able to handle NaN.
MODELS_NAN = ["DT", "GBT", "HGB"]

# (dataset name, file path) pairs handed to generate_yaml; unlike the base directories above,
# these paths start with './'.
DATASETS = [
    ("adult", "./data/adult/adult.train.csv"),
    ("adult_noisy", "./data/adult_noisy/adult_0.000000.train.csv"),
    ("adult_numerical", "./data/adult_numerical/adult_numerical.train.csv"),
    ("bank", "./data/bank/bank.csv"),
    ("compas", "./data/compas/compas.csv"),
    ("german", "./data/german/german.csv"),
    ("phishing", "./data/phishing/phishing.csv"),
    ("titanic", "./data/titanic/titanic.csv"),
]

# Metric identifiers handed to generate_yaml.
METRICS = ["immediate_impact", "leave_out_impact"]

# fair preprocessing: Adult dataset


In [None]:
from PIPE_X.utilities import generate_yaml
from data.adult.setup import adult_noisy_dataset

from pipelines.fpp_pipelines_pickle import get_ac1_pipeline, get_ac2_pipeline, get_ac3_pipeline, get_ac4_pipeline, \
    get_ac5_pipeline, get_ac6_pipeline, get_ac7_pipeline, get_ac8_pipeline, get_ac9_pipeline, get_ac10_pipeline

# Run the Adult data set-up with a single probability of 0.0, handing it DATA_BASE_PATH as `path`.
adult_noisy_dataset(path=DATA_BASE_PATH, probabilities=[0.0])

# One row per Adult experiment: (experiment name, pipeline factory, essential steps).
# The pickle file name is derived from the experiment name, so it is spelled out only once.
adult_specs = [
    ('AC01', get_ac1_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('AC02', get_ac2_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('AC03', get_ac3_pipeline, ['fallback_encoder']),
    ('AC04', get_ac4_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('AC05', get_ac5_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('AC06', get_ac6_pipeline, ['fallback_encoder']),
    ('AC07', get_ac7_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('AC08', get_ac8_pipeline, ['fallback_encoder']),
    ('AC09', get_ac9_pipeline, ['fallback_encoder']),
    ('AC10', get_ac10_pipeline, ['fallback_encoder']),
]

# (factory, pickle file name) pairs, in experiment order.
pipelines = [
    (factory, f'adult_{label}_Imblearn.pkl')
    for label, factory, _ in adult_specs
]

# (experiment name, pickle file name, essential steps) triples, in the same order.
experiments = [
    (label, f'adult_{label}_Imblearn.pkl', essential)
    for label, _, essential in adult_specs
]

# Build each pipeline and hand it to pickle_pipeline together with the 'fpp' directory
# below PIPELINE_BASE_PATH and its file name.
fpp_pickle_dir = os.path.join(PIPELINE_BASE_PATH, 'fpp')
for factory, pickle_name in pipelines:
    pickle_pipeline(factory(), fpp_pickle_dir, pickle_name)

# PIPELINE_BASE_PATH[1:] drops the first character, turning the leading '..' into '.', so the
# pipeline path stored in the configuration starts with './' like the entries in DATASETS.
yaml_pipeline_dir = os.path.join(PIPELINE_BASE_PATH[1:], 'fpp')
experiment_list = []
for label, pickle_name, essential in experiments:
    experiment_list.append({
        'name': label,
        'datasets': ['adult_noisy'],
        'pipeline': os.path.join(yaml_pipeline_dir, pickle_name),
        'essential_steps': essential,
    })

# Pass the NaN-capable models, all datasets and metrics, and the experiments above to
# generate_yaml, with CONFIG_BASE_PATH and 'exp_FPP_adult.yaml' as the target.
generate_yaml(MODELS_NAN, DATASETS, METRICS, CONFIG_BASE_PATH, 'exp_FPP_adult.yaml', experiment_list)

In [None]:
# Step up to the parent directory, from which the experiment runner below is meant to be
# started (its arguments use './' paths), then step back into the 'notebooks' folder.
# The run itself is currently disabled; uncomment the %run line to execute exp_FPP_adult.yaml.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_FPP_adult.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
%cd notebooks

# fair preprocessing: Bank dataset


In [None]:
from PIPE_X.utilities import generate_yaml
from data.bank.setup import fetch_bank_additional_data

from pipelines.fpp_pipelines_pickle import get_bm1_pipeline, get_bm2_pipeline, get_bm3_pipeline, get_bm4_pipeline, \
    get_bm5_pipeline, get_bm6_pipeline, get_bm7_pipeline, get_bm8_pipeline

# Run the Bank data set-up (called without arguments).
fetch_bank_additional_data()

# One row per Bank experiment: (experiment name, pipeline factory, essential steps).
# The pickle file name is derived from the experiment name, so it is spelled out only once.
bank_specs = [
    ('BM01', get_bm1_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('BM02', get_bm2_pipeline, ['categorical_encoder']),
    ('BM03', get_bm3_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('BM04', get_bm4_pipeline, ['fallback_encoder']),
    ('BM05', get_bm5_pipeline, ['fallback_encoder']),
    ('BM06', get_bm6_pipeline, ['fallback_encoder']),
    ('BM07', get_bm7_pipeline, ['fallback_encoder']),
    ('BM08', get_bm8_pipeline, ['fallback_encoder']),
]

# (factory, pickle file name) pairs, in experiment order.
pipelines = [
    (factory, f'bank_{label}_Imblearn.pkl')
    for label, factory, _ in bank_specs
]

# (experiment name, pickle file name, essential steps) triples, in the same order.
experiments = [
    (label, f'bank_{label}_Imblearn.pkl', essential)
    for label, _, essential in bank_specs
]

# Build each pipeline and hand it to pickle_pipeline together with the 'fpp' directory
# below PIPELINE_BASE_PATH and its file name.
fpp_pickle_dir = os.path.join(PIPELINE_BASE_PATH, 'fpp')
for factory, pickle_name in pipelines:
    pickle_pipeline(factory(), fpp_pickle_dir, pickle_name)

# PIPELINE_BASE_PATH[1:] drops the first character, turning the leading '..' into '.', so the
# pipeline path stored in the configuration starts with './' like the entries in DATASETS.
yaml_pipeline_dir = os.path.join(PIPELINE_BASE_PATH[1:], 'fpp')
experiment_list = []
for label, pickle_name, essential in experiments:
    experiment_list.append({
        'name': label,
        'datasets': ['bank'],
        'pipeline': os.path.join(yaml_pipeline_dir, pickle_name),
        'essential_steps': essential,
    })

# Pass the NaN-capable models, all datasets and metrics, and the experiments above to
# generate_yaml, with CONFIG_BASE_PATH and 'exp_FPP_bank.yaml' as the target.
generate_yaml(MODELS_NAN, DATASETS, METRICS, CONFIG_BASE_PATH, 'exp_FPP_bank.yaml', experiment_list)

In [None]:
# Step up to the parent directory, from which the experiment runner below is meant to be
# started (its arguments use './' paths), then step back into the 'notebooks' folder.
# The run itself is currently disabled; uncomment the %run line to execute exp_FPP_bank.yaml.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_FPP_bank.yaml --output ./experiments/ --sample_size 10  --same_samples_across_experiments --replace
%cd notebooks

# fair preprocessing: German dataset


In [None]:
from PIPE_X.utilities import generate_yaml
from pipelines.fpp_pipelines_pickle import get_gc10_pipeline, get_gc1_pipeline, get_gc2_pipeline, get_gc3_pipeline, \
    get_gc4_pipeline, get_gc5_pipeline, get_gc6_pipeline, get_gc7_pipeline, get_gc8_pipeline, get_gc9_pipeline
from data.german.setup import german_dataset

# Run the German data set-up, handing it DATA_BASE_PATH.
german_dataset(DATA_BASE_PATH)

# One row per German experiment: (experiment name, pipeline factory, essential steps).
# The pickle file name is derived from the experiment name, so it is spelled out only once.
german_specs = [
    ('GC01', get_gc1_pipeline, ['fallback_encoder']),
    ('GC02', get_gc2_pipeline, ['fallback_encoder']),
    ('GC03', get_gc3_pipeline, ['fallback_encoder']),
    ('GC04', get_gc4_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('GC05', get_gc5_pipeline, ['fallback_encoder']),
    ('GC06', get_gc6_pipeline, ['categorical_encoder', 'fallback_encoder']),
    ('GC07', get_gc7_pipeline, ['fallback_encoder']),
    ('GC08', get_gc8_pipeline, ['fallback_encoder']),
    ('GC09', get_gc9_pipeline, ['fallback_encoder']),
    ('GC10', get_gc10_pipeline, ['fallback_encoder']),
]

# (factory, pickle file name) pairs, in experiment order.
pipelines = [
    (factory, f'german_{label}_Imblearn.pkl')
    for label, factory, _ in german_specs
]

# (experiment name, pickle file name, essential steps) triples, in the same order.
experiments = [
    (label, f'german_{label}_Imblearn.pkl', essential)
    for label, _, essential in german_specs
]

# Build each pipeline and hand it to pickle_pipeline together with the 'fpp' directory
# below PIPELINE_BASE_PATH and its file name.
fpp_pickle_dir = os.path.join(PIPELINE_BASE_PATH, 'fpp')
for factory, pickle_name in pipelines:
    pickle_pipeline(factory(), fpp_pickle_dir, pickle_name)

# PIPELINE_BASE_PATH[1:] drops the first character, turning the leading '..' into '.', so the
# pipeline path stored in the configuration starts with './' like the entries in DATASETS.
yaml_pipeline_dir = os.path.join(PIPELINE_BASE_PATH[1:], 'fpp')
experiment_list = []
for label, pickle_name, essential in experiments:
    experiment_list.append({
        'name': label,
        'datasets': ['german'],
        'pipeline': os.path.join(yaml_pipeline_dir, pickle_name),
        'essential_steps': essential,
    })

# Pass the full model list, all datasets and metrics, and the experiments above to
# generate_yaml, with CONFIG_BASE_PATH and 'exp_FPP_german.yaml' as the target.
# NOTE(review): this cell passes MODEL_NAMES, whereas the Adult, Bank and Titanic cells pass
# MODELS_NAN — confirm the difference is intended.
generate_yaml(MODEL_NAMES, DATASETS, METRICS, CONFIG_BASE_PATH, 'exp_FPP_german.yaml', experiment_list)

In [None]:
# Step up to the parent directory, from which the experiment runner below is meant to be
# started (its arguments use './' paths), then step back into the 'notebooks' folder.
# The run itself is currently disabled; uncomment the %run line to execute exp_FPP_german.yaml.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_FPP_german.yaml --output ./experiments/ --sample_size 10  --same_samples_across_experiments --replace
%cd notebooks

# fair preprocessing: Titanic dataset


In [None]:
from data.titanic.setup import main
from PIPE_X.utilities import generate_yaml
from pipelines.fpp_pipelines_pickle import get_tt1_pipeline, get_tt7_pipeline, get_tt8_pipeline

# NOTE(review): the Titanic data set-up call is commented out, unlike the set-up calls in the
# Adult, Bank and German cells — presumably the data is expected to exist already; confirm.
# main()

# One row per Titanic experiment: (experiment name, pipeline factory, essential steps).
# The pickle file name is derived from the experiment name, so it is spelled out only once.
titanic_specs = [
    ('TT01', get_tt1_pipeline, ['fallback_encoder', 'categorical_encoder']),
    ('TT07', get_tt7_pipeline, ['fallback_encoder', 'categorical_encoder']),
    ('TT08', get_tt8_pipeline, ['fallback_encoder', 'categorical_encoder']),
]

# (factory, pickle file name) pairs, in experiment order.
pipelines = [
    (factory, f'titanic_{label}_Imblearn.pkl')
    for label, factory, _ in titanic_specs
]

# (experiment name, pickle file name, essential steps) triples, in the same order.
experiments = [
    (label, f'titanic_{label}_Imblearn.pkl', essential)
    for label, _, essential in titanic_specs
]

# Build each pipeline and hand it to pickle_pipeline together with the 'fpp' directory
# below PIPELINE_BASE_PATH and its file name.
fpp_pickle_dir = os.path.join(PIPELINE_BASE_PATH, 'fpp')
for factory, pickle_name in pipelines:
    pickle_pipeline(factory(), fpp_pickle_dir, pickle_name)

# PIPELINE_BASE_PATH[1:] drops the first character, turning the leading '..' into '.', so the
# pipeline path stored in the configuration starts with './' like the entries in DATASETS.
yaml_pipeline_dir = os.path.join(PIPELINE_BASE_PATH[1:], 'fpp')
experiment_list = []
for label, pickle_name, essential in experiments:
    experiment_list.append({
        'name': label,
        'datasets': ['titanic'],
        'pipeline': os.path.join(yaml_pipeline_dir, pickle_name),
        'essential_steps': essential,
    })

# Pass the NaN-capable models, all datasets and metrics, and the experiments above to
# generate_yaml, with CONFIG_BASE_PATH and 'exp_FPP_titanic.yaml' as the target.
generate_yaml(MODELS_NAN, DATASETS, METRICS, CONFIG_BASE_PATH, 'exp_FPP_titanic.yaml', experiment_list)

In [None]:
# Step up to the parent directory, from which the experiment runner below is meant to be
# started (its arguments use './' paths), then step back into the 'notebooks' folder.
# The run itself is currently disabled; uncomment the %run line to execute exp_FPP_titanic.yaml.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_FPP_titanic.yaml --output ./experiments/ --sample_size 10  --same_samples_across_experiments --replace
%cd notebooks

# fair preprocessing: Compas dataset


In [None]:
from pipelines.fpp_pipelines_pickle import get_cp1_pipeline

# Build the Compas CP1 pipeline and hand it to pickle_pipeline with its target folder and file name.
# NOTE(review): the target folder is 'fair_preprocessing', whereas the Adult, Bank, German and
# Titanic cells use 'fpp', and the file name has no '_Imblearn' suffix — confirm both are intended.
compas_pipeline = get_cp1_pipeline()
compas_pickle_dir = os.path.join(PIPELINE_BASE_PATH, 'fair_preprocessing')
pickle_pipeline(compas_pipeline, compas_pickle_dir, 'compas_CP1.pkl')