# Data Pre-processing

## Set globals + initial cleaning

In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import SEED, BASE_PATH
import pandas as pd
import warnings

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from src.preprocess import BMICalculatorArray, transform_export_data

# Load the outcome table (first spreadsheet column becomes the index) and the
# feature spreadsheet from BASE_PATH/data/processed.
outcome_df = pd.read_excel(
    BASE_PATH.joinpath("data", "processed", "Outcome_df.xlsx"),
    index_col=0,
)
X_df = pd.read_excel(
    BASE_PATH.joinpath("data", "processed", "fully_cleaned_tongue_data.xlsx")
)

In [None]:
# Predictor columns to keep, listed in mixed case and upper-cased below before
# they are used to index X_df.
x_cols = [
    # --- Pre-operative variables ---
    "SEX",  # Nominal
    "RACE_NEW",  # Nominal
    "ETHNICITY_HISPANIC",  # Binary
    "INOUT",  # Binary
    "Age",  # Numerical
    "URGENCY",  # Nominal
    "HEIGHT",  # Numerical
    "WEIGHT",  # Numerical
    "DIABETES",  # Binary
    "SMOKE",  # Binary
    "DYSPNEA",  # Binary
    "FNSTATUS2",
    "VENTILAT",
    "HXCOPD",
    "ASCITES",
    "HXCHF",
    "HYPERMED",
    "RENAFAIL",
    "DIALYSIS",
    "DISCANCR",
    "WNDINF",
    "STEROID",
    "WTLOSS",
    "BLEEDDIS",
    "TRANSFUS",
    "PRSEPIS",
    "PRALBUM",
    "PRWBC",
    "PRHCT",
    "PRPLATE",
    "ASACLAS",
    "OPERYR",
    # --- Intra-operative variables ---
    "OPTIME",
    "Partial Glossectomy (Hemiglossectomy_Subtotal)",
    "Composite_Extended Glossectomy",
    "Total Glossectomy (Complete Tongue Removal)",
    "Excision of Tongue Lesions (Minor)",
    "Local_Regional Tissue Flaps for Oral Cavity Reconstruction",
    "Free Tissue Transfer (Microvascular Free Flaps) and Complex Flap Reconstruction",
    "Skin Autografts for Head and Neck Reconstruction",
    "Neck Dissection and Lymphadenectomy Procedures",
    "Alveolar Ridge and Gingival Procedures",
    "Mandibular Resection and Reconstruction Procedures",
    "Peripheral Nerve Repair and Neuroplasty",
    "Tracheostomy Procedures",
    "Gastrostomy and Esophageal Access Procedures",
    "Submandibular Gland Excision",
    "Parotid Gland Excision",
    "Laryngeal Resection and Reconstruction Procedures",
    "Pharyngeal Resection and Reconstruction Procedures",
    "Tonsillectomy and Tonsillar Region Procedures",
    "Malignant neoplasm",
]
# The lookup into X_df uses the upper-cased form of every name above.
x_cols_cap = list(map(str.upper, x_cols))
# Work on an independent copy so later recoding does not touch X_df.
X_sub = X_df[x_cols_cap].copy()
X_sub.shape

## Pipeline

Initialize feature types

In [None]:
# Group the selected columns by feature type using the get_feature_lists
# helper imported from src.data_utils, then unpack each group by its key.
feature_lists = get_feature_lists(X_sub)
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[group]
    for group in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

In [None]:
# Per-column recoding of text categories to "1"/"0".
# NOTE(review): these replacement values are strings, whereas the Yes/No
# recoding below writes integers (1/0) — confirm the mixed types are intended
# for whichever feature group each of these columns ends up in.
replace_dict = {
    "SEX": {"male": "1", "female": "0"},
    "URGENCY": {"Urgent_Emergent": "1", "Elective": "0"},
    "ETHNICITY_HISPANIC": {"noUnknown": "0", "Yes": "1"},
}
# Binary columns that contain the literal value "Yes". This list is computed
# before any replacement is applied to X_sub.
replace_w_binary_cols = [col for col in binary_cols if "Yes" in X_sub[col].unique()]
with warnings.catch_warnings():
    # Silence only the pandas FutureWarning whose message mentions
    # "Downcasting behavior"; other warnings are still shown.
    warnings.filterwarnings(
        "ignore", category=FutureWarning, message=".*Downcasting behavior.*"
    )
    # Step 1: apply the per-column mappings from replace_dict.
    X_sub = X_sub.replace(replace_dict).infer_objects(copy=False).copy()
    # Step 2: recode Yes/No to 1/0 in the columns selected above.
    X_sub[replace_w_binary_cols] = (
        X_sub[replace_w_binary_cols]
        .replace({"Yes": 1, "No": 0})
        .infer_objects(copy=False)
        .copy()
    )
# Sanity check: the four feature-type lists together must account for exactly
# as many columns as X_sub has.
assert (
    len(binary_cols) + len(nominal_cols) + len(ordinal_cols) + len(numerical_cols)
    == X_sub.shape[1]
)

Construct Pipeline

In [None]:
# Category order for the ASA class ordinal encoder, lowest class first.
# (A wound-class ordinal encoder was drafted here previously and is disabled.)
asa_class_order = [
    "1-No Disturb",
    "2-Mild Disturb",
    "3-Severe Disturb",
    "4/5-Life Threat/Moribund",
]

# Positions of HEIGHT and WEIGHT inside the numerical column list; these are
# handed to the BMI step, which runs after imputation.
height_idx, weight_idx = (
    numerical_cols.index(label) for label in ("HEIGHT", "WEIGHT")
)

# Numerical features: impute -> BMI step -> min-max scale.
num_pipeline = Pipeline(
    steps=[
        (
            "imputer",
            IterativeImputer(
                estimator=None,  # None selects the default, BayesianRidge
                initial_strategy="median",
                max_iter=10,
                sample_posterior=False,  # keep imputation deterministic
            ),
        ),
        ("bmi", BMICalculatorArray(height_idx=height_idx, weight_idx=weight_idx)),
        ("scaler", MinMaxScaler()),
    ]
)

# Nominal features: one-hot encode; categories unseen at fit time are ignored.
nom_pipeline = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

# ASA class: ordinal encode in the order above; unknown values map to -1.
asa_pipeline = Pipeline(
    steps=[
        (
            "encoder",
            OrdinalEncoder(
                categories=[asa_class_order],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        )
    ]
)

# Assemble the per-type transformers. Binary columns are passed through
# unchanged; the first ordinal column is routed to the ASA encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numerical_cols),
        ("cat", nom_pipeline, nominal_cols),
        ("ord_asa", asa_pipeline, [ordinal_cols[0]]),
        ("bin", "passthrough", binary_cols),
    ]
)

Transform

In [None]:
# Run transform_export_data once per outcome column, passing the same feature
# table, the shared preprocessor, and the two output directories below. Each
# call's return value is kept under its own name so no result is overwritten.
data_path = BASE_PATH / "data" / "processed"
pipeline_path = BASE_PATH / "data" / "preprocessors"
# 8.4% positive distribution
surg_data = transform_export_data(
    X_sub,
    outcome_df["Surgical_Outcome"],
    "outcome_surg",
    preprocessor,
    data_path,
    pipeline_path,
)
# 10% positive distribution
bleed_data = transform_export_data(
    X_sub,
    outcome_df["Bleed_Outcome"],
    "outcome_bleed",
    preprocessor,
    data_path,
    pipeline_path,
)
# 7.05% positive distribution
asp_data = transform_export_data(
    X_sub,
    outcome_df["Aspiration_Outcome"],
    "outcome_asp",
    preprocessor,
    data_path,
    pipeline_path,
)
# 0.9% positive distribution
mort_data = transform_export_data(
    X_sub,
    outcome_df["Mortality_Outcome"],
    "outcome_mort",
    preprocessor,
    data_path,
    pipeline_path,
)
# 8.5% positive distribution
# Stored as reop_data: this result was previously assigned to mort_data,
# which discarded the mortality result from the call above.
reop_data = transform_export_data(
    X_sub,
    outcome_df["ReOp_Outcome"],
    "outcome_reop",
    preprocessor,
    data_path,
    pipeline_path,
)