In [1]:
# Reload imported modules automatically before code is executed, so that edits
# to the project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# The parent of the current working directory is treated as the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root at the front of the module search path (only once, so
# re-running the cell does not add duplicates); the project-local imports
# below presumably rely on this.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [3]:
import os
import numpy as np

from data.adult.setup import adult_noisy_dataset
from data.adult.setup import adult_dataset

from pipelines.sanity_check_pipelines_pickle import get_sc_destroyer_pipeline
from pipelines.sanity_check_pipelines_pickle import get_sc_imputer_pipeline

from PIPE_X.utilities import generate_yaml
from PIPE_X.pipeline import AbstractPipeline, pickle_pipeline

# Prepare

In [4]:
# Base directories for generated YAML configs, datasets and pickled pipelines.
# All of them point one level above the working directory.
CONFIG_BASE_PATH = '../experiments/yaml_files'
DATA_BASE_PATH = '../data'
PIPELINE_BASE_PATH = '../pipelines/pickles'

# Identifiers of all models that are written to the experiment configs.
MODEL_NAMES = ['LR', 'DT', 'GBT', 'KNN', 'HGB', 'NN', 'SVC']

# Subset of the models that is able to handle NaN values.
MODELS_NAN = ['DT', 'GBT', 'HGB']

# (dataset name, path to the CSV file) pairs.
# NOTE(review): the './data/...' paths look relative to the project root
# rather than to the notebook directory -- confirm against experiment_runner.
DATASETS = [
    ('adult', './data/adult/adult.train.csv'),
    ('adult_numerical', './data/adult_numerical/adult_numerical.train.csv'),
    ('bank', './data/bank/bank.csv'),
    ('compas', './data/compas/compas.csv'),
    ('german', './data/german/german.csv'),
    ('phishing', './data/phishing/phishing.csv'),
    ('titanic', './data/titanic/titanic.csv'),
]

# Names of the metrics that are written to the experiment configs.
METRICS = ['immediate_impact', 'leave_out_impact']

# Probability grids with a step width of 0.04:
# 26 values (0.0 up to 1.0) for the destroyer, 21 values (0.0 up to 0.8) for the imputer.
DEFAULT_PROBABILITIES_DESTROYER = [step * 0.04 for step in range(26)]
DEFAULT_PROBABILITIES_IMPUTER = [step * 0.04 for step in range(21)]

# Passed as the third positional argument to `get_sc_destroyer_pipeline`.
DESTROYER_CONSTANT = 1.0

## Sanity Check: Destroyer

In [5]:
def generate_destroyer_experiment(name: str, marital_status: bool, scaled: bool, missing: bool) -> AbstractPipeline:
    """
    Pickle one destroyer pipeline per probability and write the matching experiment YAML.

    Args:
        name (str): Prefix of the pickle file names, the experiment names and the YAML
            file name. Its first 12 characters (``'sc_destroyer'`` for the names used in
            this notebook) form the sub-directory the pickles are stored in.
        marital_status (bool): Forwarded to `get_sc_destroyer_pipeline`; if True, the
            marital-status replacement transformer is included in each pipeline.
        scaled (bool): Forwarded to `get_sc_destroyer_pipeline`; if True, the scaled
            ordering of the numerical and destroyer steps is used.
        missing (bool): If True, the experiments run on the noisy adult dataset
            (`adult_noisy_0.000000`) and only the models in `MODELS_NAN` are configured.
            Otherwise the plain `adult` dataset and all `MODEL_NAMES` are used.

    Side effects:
        - One pickle per entry of `DEFAULT_PROBABILITIES_DESTROYER` is written below
          `PIPELINE_BASE_PATH`; every pipeline gets a fresh RNG seeded with 42.
        - `adult_noisy_dataset` (probability 0.0) and `adult_dataset` are called with
          `DATA_BASE_PATH`, regardless of `missing`.
        - `exp_{name}.yaml` is written to `CONFIG_BASE_PATH` via `generate_yaml`.

    Returns:
        The pipeline built for the last probability, or None if
        `DEFAULT_PROBABILITIES_DESTROYER` is empty.
    """
    pipeline_folder = name[:12]
    # Dropping the leading '.' turns '../pipelines/...' into './pipelines/...'.
    yaml_pipeline_root = PIPELINE_BASE_PATH[1:]
    noisy_dataset_name = f'adult_noisy_{0.0:05f}'

    # Step 1: build and pickle a pipeline for every probability
    last_pipeline = None
    for prob in DEFAULT_PROBABILITIES_DESTROYER:
        last_pipeline = get_sc_destroyer_pipeline(
                np.random.default_rng(42),
                prob,
                DESTROYER_CONSTANT,
                marital_status=marital_status,
                scaled=scaled,
        )
        pickle_pipeline(last_pipeline, os.path.join(PIPELINE_BASE_PATH, pipeline_folder), f'{name}_{prob:05f}.pkl')

    # Step 2: make sure the datasets exist
    adult_noisy_dataset(probabilities=[0.0], path=DATA_BASE_PATH)
    adult_dataset(DATA_BASE_PATH)

    # Step 3: pick models and datasets depending on `missing`
    if missing:
        models = MODELS_NAN
        experiment_dataset = noisy_dataset_name
        available_datasets = [(noisy_dataset_name, f'./data/adult_noisy/adult_{0.0:05f}.train.csv')]
    else:
        models = MODEL_NAMES
        experiment_dataset = 'adult'
        available_datasets = DATASETS

    # Step 4: one experiment entry per probability (each entry gets its own fresh lists)
    experiment_list = [
        {
            'name': f'{name}_adult_{prob:05f}',
            'datasets': [experiment_dataset],
            'pipeline': os.path.join(yaml_pipeline_root, pipeline_folder, f'{name}_{prob:05f}.pkl'),
            'essential_steps': ['categorical_encoder'],
            'param': prob,
        }
        for prob in DEFAULT_PROBABILITIES_DESTROYER
    ]

    generate_yaml(models, available_datasets, METRICS, CONFIG_BASE_PATH, f'exp_{name}.yaml', experiment_list)
    return last_pipeline

In [6]:
# Generate the pickled pipelines and the YAML config for every destroyer variant.
# Naming scheme of the experiments:
#   - 'scaled' / 'unscaled'  -> value of the `scaled` flag
#   - '_minimal' suffix      -> marital_status=False
#   - '_missing' suffix      -> missing=True
generate_destroyer_experiment('sc_destroyer_unscaled', marital_status=True, scaled=False, missing=False)
generate_destroyer_experiment('sc_destroyer_scaled', marital_status=True, scaled=True, missing=False)
generate_destroyer_experiment('sc_destroyer_unscaled_minimal', marital_status=False, scaled=False, missing=False)
generate_destroyer_experiment('sc_destroyer_scaled_minimal', marital_status=False, scaled=True, missing=False)
generate_destroyer_experiment('sc_destroyer_unscaled_missing', marital_status=True, scaled=False, missing=True)
generate_destroyer_experiment('sc_destroyer_scaled_missing', marital_status=True, scaled=True, missing=True)
generate_destroyer_experiment('sc_destroyer_unscaled_minimal_missing', marital_status=False, scaled=False, missing=True)
generate_destroyer_experiment('sc_destroyer_scaled_minimal_missing', marital_status=False, scaled=True, missing=True)

In [7]:
# Change to the parent directory so that the './'-relative paths used by the
# commands below resolve, then change back into 'notebooks' afterwards.
# The `%run` commands are commented out; uncomment a line to run the
# corresponding experiment configuration with experiment_runner.py.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_unscaled.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_scaled.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_unscaled_minimal.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_scaled_minimal.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_unscaled_missing.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_scaled_missing.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_unscaled_minimal_missing.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_destroyer_scaled_minimal_missing.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
%cd notebooks

/home/ngeisler/XAI-for-Preprocessing
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
LR for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:34.181736
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


LR for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

DT for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:35.235794
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


DT for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:36.876697
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


GBT for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

KNN for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:45.885821
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


KNN for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:46.879999
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


HGB for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

NN for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:30:53.749744
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


NN for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

SVC for sc_destroyer_unscaled_adult_0.020000 on adult
2025-10-24 17:31:02.838912
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


SVC for sc_destroyer_unscaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:31:23.108904
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
LR for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:23.137738
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


LR for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

DT for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:24.216470
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


DT for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:25.848283
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


GBT for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

KNN for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:34.284437
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


KNN for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:35.296257
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


HGB for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

NN for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:42.060261
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


NN for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

SVC for sc_destroyer_scaled_adult_0.020000 on adult
2025-10-24 17:31:53.485976
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


SVC for sc_destroyer_scaled_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:32:13.167452
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
LR for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:13.194062
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


LR for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

DT for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:13.956655
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


DT for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:15.255016
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


GBT for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

KNN for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:22.093019
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


KNN for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:22.871512
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


HGB for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

NN for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:28.369062
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


NN for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

SVC for sc_destroyer_unscaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:37.314333
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


SVC for sc_destroyer_unscaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:32:56.164847
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
LR for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:56.191566
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


LR for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

DT for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:57.013143
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


DT for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:32:58.327479
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


GBT for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

KNN for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:33:05.296229
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


KNN for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:33:06.080570
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


HGB for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

NN for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:33:11.645113
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


NN for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

SVC for sc_destroyer_scaled_minimal_adult_0.020000 on adult
2025-10-24 17:33:20.407113
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


SVC for sc_destroyer_scaled_minimal_adult_0.020000 on adult:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:33:39.950642
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:33:39.983269
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


DT for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?i…

GBT for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:33:56.920873
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


GBT for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?…

HGB for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:34:20.633498
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, numerical_imputer, numerical_scaler, destroyer


HGB for sc_destroyer_unscaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?…

2025-10-24 17:34:42.376625
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:34:42.409818
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


DT for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?it/…

GBT for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:34:59.111181
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


GBT for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?it…

HGB for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:35:22.891839
Essential steps: categorical_encoder
Steps that can be inspected: replace_marital_status, categorical_imputer, destroyer, numerical_imputer, numerical_scaler


HGB for sc_destroyer_scaled_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00<?, ?it…

2025-10-24 17:35:44.575558
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:35:44.603262
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


DT for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:…

GBT for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:35:58.540556
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


GBT for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00…

HGB for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:36:18.118919
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, numerical_imputer, numerical_scaler, destroyer


HGB for sc_destroyer_unscaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00…

2025-10-24 17:36:35.968142
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:36:35.997213
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


DT for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:00…

GBT for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:36:49.911640
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


GBT for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:0…

HGB for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000
2025-10-24 17:37:09.537606
Essential steps: categorical_encoder
Steps that can be inspected: categorical_imputer, destroyer, numerical_imputer, numerical_scaler


HGB for sc_destroyer_scaled_minimal_missing_adult_0.020000 on adult_noisy_0.000000:   0%|          | 0/1 [00:0…

2025-10-24 17:37:27.444668
/home/ngeisler/XAI-for-Preprocessing/notebooks


## Sanity Check: Imputer

In [8]:
def generate_imputer_experiment(name: str, marital_status: bool, redundant: bool):
    """
    Pickle a single imputer pipeline and write an experiment YAML that runs it on a
    series of noisy adult datasets.

    Args:
        name (str): Name of the pickle file, prefix of the experiment names and part of
            the YAML file name. Its first 10 characters (``'sc_imputer'`` for the names
            used in this notebook) form the sub-directory the pickle is stored in.
        marital_status (bool): Forwarded to `get_sc_imputer_pipeline`; if True, the
            marital-status replacement is included in the pipeline.
        redundant (bool): Forwarded to `get_sc_imputer_pipeline`; if True, the redundant
            pipeline variant is built.

    Side effects:
        - `{name}.pkl` is written below `PIPELINE_BASE_PATH`.
        - `adult_noisy_dataset` is called with all `DEFAULT_PROBABILITIES_IMPUTER` and
          `DATA_BASE_PATH`.
        - `exp_{name}.yaml` is written to `CONFIG_BASE_PATH` via `generate_yaml`,
          configured with the models in `MODELS_NAN`.

    Returns:
        The imputer pipeline that was pickled.
    """
    pipeline_folder = name[:10]
    pickle_file = f'{name}.pkl'
    # Dropping the leading '.' turns '../pipelines/...' into './pipelines/...'.
    yaml_pipeline_path = os.path.join(PIPELINE_BASE_PATH[1:], pipeline_folder, pickle_file)

    # Step 1: build and pickle the pipeline (the same pipeline is used by all experiments)
    imputer_pipeline = get_sc_imputer_pipeline(marital_status=marital_status, redundant=redundant)
    pickle_pipeline(imputer_pipeline, os.path.join(PIPELINE_BASE_PATH, pipeline_folder), pickle_file)

    # Step 2: create one noisy dataset per probability and collect (name, path) pairs
    adult_noisy_dataset(DEFAULT_PROBABILITIES_IMPUTER, DATA_BASE_PATH)
    noisy_datasets = []
    for prob in DEFAULT_PROBABILITIES_IMPUTER:
        noisy_datasets.append((f'adult_noisy_{prob:05f}', f'./data/adult_noisy/adult_{prob:05f}.train.csv'))

    # Step 3: one experiment entry per probability; samples are always drawn from the
    # dataset belonging to the last (i.e. highest) probability of the grid
    experiment_list = [
        {
            'name': f'{name}_adult_{prob:05f}',
            'datasets': [f'adult_noisy_{prob:05f}'],
            'pipeline': yaml_pipeline_path,
            'essential_steps': ['categorical_encoder', 'race_encoder'],
            'dataset_for_sampling': f'adult_noisy_{DEFAULT_PROBABILITIES_IMPUTER[-1]:05f}',
            'param': prob,
        }
        for prob in DEFAULT_PROBABILITIES_IMPUTER
    ]

    generate_yaml(MODELS_NAN, noisy_datasets, METRICS, CONFIG_BASE_PATH, f'exp_{name}.yaml', experiment_list)
    return imputer_pipeline

## Redundant and non-redundant variants


In [9]:
# Generate the pickled pipeline and the YAML config for every imputer variant.
# Naming scheme of the experiments:
#   - 'redundant' / 'nonredundant' -> value of the `redundant` flag
#   - '_minimal' suffix            -> marital_status=False
generate_imputer_experiment('sc_imputer_redundant', marital_status=True, redundant=True)
generate_imputer_experiment('sc_imputer_nonredundant', marital_status=True, redundant=False)
generate_imputer_experiment('sc_imputer_redundant_minimal', marital_status=False, redundant=True)
generate_imputer_experiment('sc_imputer_nonredundant_minimal', marital_status=False, redundant=False)

In [10]:
# Change to the parent directory so that the './'-relative paths used by the
# commands below resolve, then change back into 'notebooks' afterwards.
# The `%run` commands are commented out; uncomment a line to run the
# corresponding experiment configuration with experiment_runner.py.
%cd ..
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_imputer_redundant.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_imputer_nonredundant.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_imputer_redundant_minimal.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
# %run -i ./experiment_runner.py --config ./experiments/yaml_files/exp_sc_imputer_nonredundant_minimal.yaml --output ./experiments/ --sample_size 10 --same_samples_across_experiments --replace
%cd notebooks

/home/ngeisler/XAI-for-Preprocessing
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:37:28.202318
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, categorical_imputer, numerical_imputer, numerical_scaler


DT for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:37:57.207988
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, categorical_imputer, numerical_imputer, numerical_scaler


GBT for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:38:35.987727
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, categorical_imputer, numerical_imputer, numerical_scaler


HGB for sc_imputer_redundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:39:11.366344
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:39:11.394818
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, numerical_imputer, categorical_imputer, numerical_scaler


DT for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

GBT for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:39:19.491151
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, numerical_imputer, categorical_imputer, numerical_scaler


GBT for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

HGB for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:39:37.207174
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, replace_marital-status, numerical_imputer, categorical_imputer, numerical_scaler


HGB for sc_imputer_nonredundant_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it/s]

2025-10-24 17:39:51.859670
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:39:51.890565
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, categorical_imputer, numerical_imputer, numerical_scaler


DT for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?it…

GBT for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:40:17.538899
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, categorical_imputer, numerical_imputer, numerical_scaler


GBT for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?i…

HGB for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:40:51.427474
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, categorical_imputer, numerical_imputer, numerical_scaler


HGB for sc_imputer_redundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, ?i…

2025-10-24 17:41:22.804298
Running 1 experiment configurations in config file.
Starting experiment 1 of 1
DT for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:41:22.832644
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, numerical_imputer, categorical_imputer, numerical_scaler


DT for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?, …

GBT for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:41:29.765287
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, numerical_imputer, categorical_imputer, numerical_scaler


GBT for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?,…

HGB for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000
2025-10-24 17:41:44.880296
Essential steps: race_encoder, categorical_encoder
Steps that can be inspected: education-num_imputer, capital-gain_imputer, race_imputer, numerical_imputer, categorical_imputer, numerical_scaler


HGB for sc_imputer_nonredundant_minimal_adult_0.020000 on adult_noisy_0.020000:   0%|          | 0/1 [00:00<?,…

2025-10-24 17:41:57.176586
/home/ngeisler/XAI-for-Preprocessing/notebooks
