# Pre-processing for ML

## Set up

Import data/packages/libraries

In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import SEED, BASE_PATH
from src.preprocess import BMICalculatorArray, transform_export_data, clip_and_round_asa
import pandas as pd
import warnings
import numpy as np

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    MinMaxScaler,
    FunctionTransformer,
)
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Raw feature table; the modelling columns are selected from it further down.
import_df = pd.read_parquet(
    BASE_PATH.joinpath("data", "raw", "cleaned", "NSQIP_mast_combined.parquet")
)
# Outcome table; each of its columns is used as a separate target later on.
outcome_df = pd.read_parquet(
    BASE_PATH.joinpath("data", "processed", "outcome_df.parquet")
)

Reformat outcomes

Subset DF for modeling

In [None]:
## Subset
# Pre-operative variables.
preop_cols = [
    "AGE",
    "HEIGHT",
    "WEIGHT",
    "SEX",
    "ETHNICITY_HISPANIC",  # added
    "RACE",
    "DIABETES",
    "HXCOPD",
    "HXCHF",
    "ASCITES",
    "BLEEDDIS",
    "TRANSFUS",
    "DIALYSIS",
    "HYPERMED",
    "VENTILAT",
    "SMOKE",
    "DISCANCR",
    "RENAFAIL",
    "STEROID",
    "ASACLAS",
    "PRALBUM",  # added
    "PRWBC",  # added
    "PRHCT",  # added
    "PRPLATE",  # added
    "DYSPNEA",
    "WNDINF",
    "WTLOSS",
]

# Surgical characteristics.
surgical_cols = [
    "OPTIME",
    "SURGINDICD",
    "SNLBCPT",
    "ALNDCPT",
    "PARTIALCPT",
    "SUBSIMPLECPT",
    "RADICALCPT",
    "MODIFIEDRADICALCPT",
    "IMMEDIATECPT",
    "DELAYEDCPT",
    "TEINSERTIONCPT",
    "TEEXPANDERCPT",
    "FREECPT",
    "LATCPT",
    "SINTRAMCPT",
    "SINTRAMSUPERCPT",
    "BITRAMCPT",
    "MASTOCPT",
    "BREASTREDCPT",
    "FATGRAFTCPT",
    "ADJTISTRANSCPT",
    "AUGPROSIMPCPT",
    "OTHERRECONTECHCPT",
    "REVRECBREASTCPT",
    "NPWTCPT",
    "URGENCY",
    "ANESTHES",
    "SURGSPEC",
    "INOUT",
    "OPERYR",
]

# Column order of the modelling frame: pre-op block first, then surgical block.
keep_cols = preop_cols + surgical_cols

# Work on a copy so later steps never write back into import_df.
df_sub = import_df[keep_cols].copy()

Get feature lists

In [None]:
# get_feature_lists is a project helper; its result is indexed by the four
# keys below, one list of column names per feature type.
feat_list_dict = get_feature_lists(df_sub)
binary_cols, nominal_cols, ordinal_cols, numerical_cols = (
    feat_list_dict[key]
    for key in ("binary_cols", "nominal_cols", "ordinal_cols", "numerical_cols")
)

## Create Pipeline

In [None]:
def _make_iterative_imputer():
    """Return a new IterativeImputer with the settings shared by the numerical and ASA pipelines."""
    return IterativeImputer(
        estimator=None,  # default = BayesianRidge
        initial_strategy="median",
        max_iter=10,
        sample_posterior=False,  # deterministic
    )


################# Numerical pipeline #################
# ====> Impute, calculate BMI, then scale
# BMICalculatorArray is configured with column positions, so look up where
# HEIGHT and WEIGHT sit within numerical_cols.
height_idx = numerical_cols.index("HEIGHT")
weight_idx = numerical_cols.index("WEIGHT")
num_pipeline = Pipeline(
    steps=[
        ("imputer", _make_iterative_imputer()),
        ("bmi", BMICalculatorArray(height_idx=height_idx, weight_idx=weight_idx)),
        # Only age is ~normal --> MinMaxScaler is a better fit than StandardScaler()
        ("scaler", MinMaxScaler()),
    ]
)

################# Ordinal pipeline #################
# ==============> Separate imputer/encoder for ASA
asa_col = ["ASACLAS"]
# NOTE(review): the ColumnTransformer below hands this pipeline only the
# ASACLAS column, so the imputer has no other features to model it from --
# confirm that this (rather than imputing ASA from the other columns) is intended.
asa_pipeline = Pipeline(
    [
        # 1. Imputation on the ASA column
        ("imputer", _make_iterative_imputer()),
        # 2. Round to nearest integer, cast to int
        (
            "round_to_int",
            FunctionTransformer(clip_and_round_asa, feature_names_out="one-to-one"),
        ),
        # 3. Ordinal encoding just in case
        ("encoder", OrdinalEncoder(categories=[[1, 2, 3, 4]])),
    ]
)

# ==============> Separate encoder for all other ordinals (0, 1, 2+)
other_ordinal_cols = [name for name in ordinal_cols if name not in asa_col]
num_other_ordinal = len(other_ordinal_cols)
# One identical category list per remaining ordinal column.
ordinal_levels = [["0", "1", "2+"]] * num_other_ordinal
other_ordinal_pipeline = Pipeline(
    steps=[("encoder", OrdinalEncoder(categories=ordinal_levels))]
)

################# Nominal pipeline #################
# =========> One-hot encode
nom_pipeline = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

################# Combine all preprocessing #################
# Each entry is (name, transformer, columns it is applied to); binary columns
# are passed through without any transformation.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_cols),
        ("cat", nom_pipeline, nominal_cols),
        ("ord_asa", asa_pipeline, asa_col),
        ("ord_other", other_ordinal_pipeline, other_ordinal_cols),
        ("bin", "passthrough", binary_cols),
    ]
)

Transform

In [None]:
data_path = BASE_PATH / "data" / "processed"
pipeline_path = BASE_PATH / "data" / "pipelines"

# Outcome column -> short name. The short name is only used for the directory
# that transform_export_data writes to.
OUTCOME_DIR_NAMES = {
    "AnyMedComp": "med_outcome",
    "AnySurgComp": "surg_outcome",
    "MORTALITY": "mort_outcome",
    "UNPLREOP": "reop_outcome",
    "VTE": "vte_outcome",
}

# Validate before exporting anything: an outcome column without a mapping
# would otherwise have no directory name of its own and could end up written
# under another outcome's directory.
unknown_outcomes = [col for col in outcome_df.columns if col not in OUTCOME_DIR_NAMES]
if unknown_outcomes:
    raise ValueError(
        f"No output directory name defined for outcome column(s): {unknown_outcomes}. "
        f"Known outcomes: {sorted(OUTCOME_DIR_NAMES)}"
    )

# NOTE(review): the same preprocessor object is passed for every outcome --
# confirm that transform_export_data clones or refits it per call.
split_outcome_data = {}
for outcome_name in outcome_df.columns:
    print(f"{outcome_name}...")
    outcome_name_simplified = OUTCOME_DIR_NAMES[outcome_name]
    split_outcome_data[outcome_name] = transform_export_data(
        X=df_sub,
        y=outcome_df[outcome_name],
        outcome_name=outcome_name_simplified,  # this is only used for dir to write data to
        preprocessor=preprocessor,
        data_path=data_path,
        pipeline_path=pipeline_path,
    )