In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx

def _set_paths() -> None:
    # set the correct path when launched from within PyCharm

    module_paths = ["pytools", "facet", "sklearndf"]

    import sys
    import os
    
    if 'cwd' not in globals():
        # noinspection PyGlobalUndefined
        global cwd
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)   
    print(f"working dir is '{os.getcwd()}'")
    for module_path in module_paths:
        if module_path not in sys.path:
            sys.path.insert(0, os.path.abspath(f"{cwd}/{os.pardir}/{module_path}/src"))
        print(f"added `{sys.path[0]}` to python paths")
        
def _ignore_warnings():
    # ignore irrelevant warnings that would affect the output of this tutorial notebook
    
    # ignore a useless LGBM warning
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning, message=r".*Xcode_8\.3\.3")

# run the one-off setup helpers defined in this cell
_set_paths()
_ignore_warnings()

# remove the helpers again so they do not linger in the notebook namespace
del _set_paths, _ignore_warnings

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx

import warnings
# install a blanket "ignore" filter: no category or message pattern is given,
# so it applies to every warning raised from here on
warnings.filterwarnings('ignore')

def _configure_matplotlib():
    # set global options for matplotlib
    
    import matplotlib
    
    matplotlib.rcParams['figure.figsize'] = (16.0, 8.0)
    matplotlib.rcParams['figure.dpi'] = 72

# apply the plotting defaults once for the whole notebook
_configure_matplotlib()

# drop the helper so it does not linger in the notebook namespace
del _configure_matplotlib

# Creating a DataFrame friendly scikit-learn pre-processing pipeline

The Titanic data set includes categorical features such as class and sex, and also has missing values for numeric features (e.g., age) and categorical features (e.g., embarked). The aim is to predict whether or not a passenger survived. A standard sklearn example for this dataset can be found [here](https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py).

We will build a preprocessing pipeline which:

- for categorical variables, fills missing values with the string 'Unknown' and then one-hot encodes them

- for numerical variables, fills missing values with the median

The strength of `sklearndf` is to maintain the scikit-learn conventions and expressivity, while also preserving dataframes, and hence feature names. We can see this after using `fit_transform` on our preprocessing pipeline.

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Relevant sklearndf imports
from sklearndf.transformation import (
    ColumnTransformerDF,
    OneHotEncoderDF,
    SimpleImputerDF,
)
from sklearndf.pipeline import (
    PipelineDF,
    ClassifierPipelineDF
)
from sklearndf.classification import RandomForestClassifierDF

# Load titanic data
# (version 1 of the "titanic" OpenML dataset, returned as a feature frame and a separate target)
titanic_X, titanic_y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Select features
# these two column groups are handed to the two transformers below
numerical_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Create a pre-processing pipeline
# numeric columns: impute missing values using the "median" strategy
preprocessing_numeric_df = SimpleImputerDF(strategy="median")

# categorical columns: fill missing values with the constant 'Unknown', then one-hot encode
preprocessing_categorical_df = PipelineDF(
    steps=[
        ('imputer', SimpleImputerDF(strategy='constant', fill_value='Unknown')),
        # NOTE(review): newer scikit-learn releases renamed `sparse` to
        # `sparse_output` — confirm against the pinned scikit-learn/sklearndf versions
        ('one-hot', OneHotEncoderDF(sparse=False, handle_unknown="ignore"))
    ]
)

# combine both: each sub-transformer is applied to its own list of columns
preprocessing_df = ColumnTransformerDF(
    transformers=[
        ('categorical', preprocessing_categorical_df, categorical_features),
        ('numeric', preprocessing_numeric_df, numerical_features),
    ]
)

# Run pre-processing
# fit on the full data and preview the first rows of the result as the cell output
transformed_df = preprocessing_df.fit_transform(X=titanic_X, y=titanic_y)
transformed_df.head()

# Tracing features from post-transform to original

The `sklearndf` pipeline has a `features_original` attribute which returns a series mapping the output columns (the series' index) to the input columns (the series' values). We can therefore easily select all output features generated from a given input feature, such as in this case for `embarked`.

In [None]:
# boolean mask over the output columns: True where the originating input column is "embarked"
embarked_type_derivatives = preprocessing_df.features_original == "embarked"
# keep only the columns selected by the mask and preview the first rows
transformed_df.loc[:, embarked_type_derivatives].head()

# Completing the pipeline with a classifier

Scikit-learn regressors and classifiers have a `sklearndf` sibling obtained by appending DF to the class name; the API remains the same. The result of any predict and decision function will be returned as a pandas series (single output) or data frame (class probabilities or multi-output).

We can combine the preprocessing pipeline above with a classifier to create a full predictive pipeline. sklearndf provides two useful, specialised pipeline objects for this, `RegressorPipelineDF` and `ClassifierPipelineDF`. Both implement a special two-step pipeline with one pre-processing step and one prediction step, while staying compatible with the general sklearn pipeline idiom.

Using `ClassifierPipelineDF` we can combine the preprocessing pipeline with `RandomForestClassifierDF()` to fit a model to a selected training set and then score it on a test set.

In [None]:
from facet.simulation import UnivariateUpliftSimulator
from facet.simulation.partition import ContinuousRangePartitioner
from facet.simulation.viz import SimulationDrawer

# name of the feature to simulate
# NOTE(review): "LSTAT" is not among the titanic features selected earlier in this
# notebook — confirm this cell belongs here
SIM_FEAT = "LSTAT"
# NOTE(review): `ranker` is not defined in any cell visible above; on a fresh kernel
# this line would raise NameError unless it is created elsewhere — verify
simulator = UnivariateUpliftSimulator(crossfit = ranker.best_model_crossfit, n_jobs=3)

# Split the simulation range into equal sized partitions
partitioner = ContinuousRangePartitioner()

# run the simulation for SIM_FEAT using the partitioner created above
simulation = simulator.simulate_feature(name=SIM_FEAT, partitioner = partitioner)

# draw the simulation result, using the feature name as the title
SimulationDrawer().draw(
    data=simulation, title=SIM_FEAT
)