# Data Pre-processing

## Set globals + initial cleaning

In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import SEED, BASE_PATH
from pathlib import Path
from shutil import rmtree

import pandas as pd
import numpy as np

# Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split

# Both input spreadsheets sit side by side in the processed-data folder.
_processed_dir = BASE_PATH / "data" / "processed"
outcome_df = pd.read_excel(_processed_dir / "Outcome_df.xlsx")
X_df = pd.read_excel(_processed_dir / "fully_cleaned_tongue_data.xlsx")

In [None]:
# Predictor columns, grouped by the phase of care they describe. The trailing
# type notes on the first few entries are the original author's annotations.
_pre_op_cols = [
    "SEX",  # Nominal
    "RACE_NEW",  # Nominal
    "ETHNICITY_HISPANIC",  # Binary
    "INOUT",  # Binary
    "Age",  # Numerical
    "ELECTSURG",  # Nominal
    "HEIGHT",  # Numerical
    "WEIGHT",  # Numerical
    "DIABETES",  # Binary
    "SMOKE",  # Binary
    "DYSPNEA",  # Binary
    "FNSTATUS2",
    "VENTILAT",
    "HXCOPD",
    "ASCITES",
    "HXCHF",
    "HYPERMED",
    "RENAFAIL",
    "DIALYSIS",
    "DISCANCR",
    "WNDINF",
    "STEROID",
    "WTLOSS",
    "BLEEDDIS",
    "TRANSFUS",
    "PRSEPIS",
    "PRALBUM",
    "PRWBC",
    "ASACLAS",
]

_intra_op_cols = [
    "OPTIME",
    "Partial Glossectomy (Hemiglossectomy_Subtotal)",
    "Composite_Extended Glossectomy",
    "Total Glossectomy (Complete Tongue Removal)",
    "Excision of Tongue Lesions (Minor)",
    "Local_Regional Tissue Flaps for Oral Cavity Reconstruction",
    "Free Tissue Transfer (Microvascular Free Flaps) and Complex Flap Reconstruction",
    "Skin Autografts for Head and Neck Reconstruction",
    "Neck Dissection and Lymphadenectomy Procedures",
    "Alveolar Ridge and Gingival Procedures",
    "Mandibular Resection and Reconstruction Procedures",
    "Peripheral Nerve Repair and Neuroplasty",
    "Tracheostomy Procedures",
    "Gastrostomy and Esophageal Access Procedures",
    "Submandibular Gland Excision",
    "Parotid Gland Excision",
    "Laryngeal Resection and Reconstruction Procedures",
    "Pharyngeal Resection and Reconstruction Procedures",
    "Tonsillectomy and Tonsillar Region Procedures",
    "Malignant neoplasm",
]

# Pre-op columns first, then intra-op, matching the order used downstream.
x_cols = _pre_op_cols + _intra_op_cols

# Work on an independent copy so later in-place recoding leaves X_df untouched.
X_sub = X_df.loc[:, x_cols].copy()
X_sub.shape

## Pipeline

Initialize feature types

In [None]:
# Column groupings are produced by the shared helper imported from
# src.data_utils; each key maps to a list of column names.
feature_lists = get_feature_lists(X_sub)
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    feature_lists[key]
    for key in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

# FNSTATUS2 and SEX use their own labels, so each gets a dedicated mapping.
# Labels are first swapped for digit strings and then cast to int.
_fnstatus_codes = {"Independent": "1", "Dependent": "0"}
X_sub["FNSTATUS2"] = X_sub["FNSTATUS2"].replace(_fnstatus_codes).astype(int)

_sex_codes = {"male": "1", "female": "0"}
X_sub["SEX"] = X_sub["SEX"].replace(_sex_codes).astype(int)

# Remaining binary columns share the Yes / No / No_Unknown vocabulary.
_yes_no_codes = {"Yes": "1", "No": "0", "No_Unknown": "0"}
_binary_recoded = X_sub[binary_cols].replace(_yes_no_codes)
X_sub[binary_cols] = _binary_recoded.apply(pd.to_numeric)

# Sanity check (displayed as the cell output): the four groups together
# should account for every column of X_sub.
sum(
    len(group)
    for group in (binary_cols, nominal_cols, ordinal_cols, numerical_cols)
) == X_sub.shape[1]

Construct Pipeline

In [None]:
class BMICalculatorArray(BaseEstimator, TransformerMixin):
    """Replace the height and weight columns of a 2-D array with one BMI column.

    BMI is computed as ``weight * 703 / height**2``. The 703 factor is the
    usual constant for pounds and inches -- NOTE(review): confirm the input
    columns really use those units.

    Parameters
    ----------
    height_idx : int
        Positional index of the height column in the array given to
        ``transform``.
    weight_idx : int
        Positional index of the weight column in the array given to
        ``transform``.
    """

    def __init__(self, height_idx, weight_idx):
        self.height_idx = height_idx
        self.weight_idx = weight_idx

    def fit(self, X, y=None):
        """Record the input width; no statistics are learned.

        Returns ``self`` so the transformer can be chained in a Pipeline.
        """
        # Remember how many columns were seen so get_feature_names_out can
        # build correctly sized default names. Inputs without a 2-D shape are
        # tolerated (nothing is recorded) to keep fit as permissive as before.
        shape = getattr(X, "shape", None)
        if shape is not None and len(shape) == 2:
            self.n_features_in_ = shape[1]
        return self

    def transform(self, X):
        """Return ``X`` without height/weight and with BMI appended last.

        The result is a ``float32`` array with one column fewer than ``X``
        (two removed, one added). Rows with a zero height produce ``inf`` or
        ``nan`` as dictated by NumPy division; no special handling is applied.
        """
        # asarray accepts DataFrames as well as ndarrays; nothing below
        # mutates X, so no defensive copy is required.
        X = np.asarray(X)
        height = X[:, self.height_idx]
        weight = X[:, self.weight_idx]
        bmi = (weight * 703) / (height**2)

        # Boolean mask that keeps every column except height and weight.
        keep = np.ones(X.shape[1], dtype=bool)
        keep[[self.height_idx, self.weight_idx]] = False

        return np.column_stack([X[:, keep], bmi]).astype(np.float32)

    def get_feature_names_out(self, input_features=None):
        """Return output column names: inputs minus height/weight, plus "BMI".

        Parameters
        ----------
        input_features : sequence of str, optional
            Names of the input columns. When omitted, placeholder names
            ``num_0 .. num_{n-1}`` are generated, where ``n`` is the width
            recorded by ``fit`` (or, if the transformer has not been fitted,
            the smallest width that contains both configured indices).
        """
        if input_features is None:
            n_features = getattr(self, "n_features_in_", None)
            if n_features is None:
                n_features = max(self.height_idx, self.weight_idx) + 1
            input_features = [f"num_{i}" for i in range(n_features)]

        dropped = {self.height_idx, self.weight_idx}
        features = [
            name for i, name in enumerate(input_features) if i not in dropped
        ]
        features.append("BMI")
        return np.array(features)


def remove_prefix(df):
    """Return a copy of ``df`` whose column names have the leading
    ``<transformer>__`` prefix removed.

    Only the first ``word__`` segment at the start of a name is stripped;
    any later double underscore inside the feature name itself is preserved.
    Column names without such a prefix are left unchanged, and the input
    frame is not modified.
    """
    X = df.copy()
    # Lazy ``\w+?`` stops at the FIRST "__". The greedy form would backtrack
    # from the end of the word run and swallow everything up to the last
    # "__", e.g. turning "cat__A__B" into "B" instead of "A__B".
    X.columns = X.columns.str.replace(r"^\w+?__", "", regex=True)
    return X


# ASA class labels listed from class 1 to class 4; the ordinal encoder maps
# each label to its position in this list.
# (A matching wound-class ordering -- '1-Clean', '2-Clean/Contaminated',
# '3-Contaminated', '4-Dirty/Infected' -- and its encoder are currently
# disabled.)
asa_class_order = [
    "1-No Disturb",
    "2-Mild Disturb",
    "3-Severe Disturb",
    "4-Life Threat",
]

# Positions of HEIGHT and WEIGHT within the numerical column list; the BMI
# step works on a plain array and therefore needs indices rather than names.
height_idx, weight_idx = (
    numerical_cols.index(name) for name in ("HEIGHT", "WEIGHT")
)

# Numerical branch: impute missing values, swap height/weight for BMI, then
# rescale with MinMaxScaler.
_num_steps = [
    ("imputer", IterativeImputer(random_state=SEED, sample_posterior=True)),
    ("bmi", BMICalculatorArray(height_idx=height_idx, weight_idx=weight_idx)),
    ("scaler", MinMaxScaler()),
]
num_pipeline = Pipeline(steps=_num_steps)

# Nominal branch: one-hot encode; categories unseen during fit are ignored.
_nominal_encoder = OneHotEncoder(handle_unknown="ignore")
nom_pipeline = Pipeline(steps=[("encoder", _nominal_encoder)])

# Ordinal branch for ASA class: labels outside asa_class_order encode as -1.
_asa_encoder = OrdinalEncoder(
    categories=[asa_class_order],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
asa_pipeline = Pipeline(steps=[("encoder", _asa_encoder)])

# Assemble the branches; binary columns are forwarded untouched.
# Only the first ordinal column is routed to the ASA encoder.
_branches = [
    ("num", num_pipeline, numerical_cols),
    ("cat", nom_pipeline, nominal_cols),
    ("ord_asa", asa_pipeline, [ordinal_cols[0]]),
    ("bin", "passthrough", binary_cols),
]
preprocessor = ColumnTransformer(transformers=_branches)

Transform

In [None]:
def _to_feature_frame(X, feature_names):
    """Apply the fitted global ``preprocessor`` to ``X`` and return a DataFrame
    labelled with ``feature_names`` minus their transformer prefixes."""
    transformed = preprocessor.transform(X)
    # ColumnTransformer can return a scipy sparse matrix when the stacked
    # output is sparse enough; np.array() on that gives a 0-d object array,
    # so densify first.
    if hasattr(transformed, "toarray"):
        transformed = transformed.toarray()
    frame = pd.DataFrame(np.array(transformed), columns=feature_names)
    return remove_prefix(frame)


def _coerce_columns_to_numeric(frame):
    """Convert every column of ``frame`` to a numeric dtype in place,
    printing (not raising) a message for columns that cannot be converted."""
    for col in frame.columns:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except (ValueError, TypeError) as e:
            # Best-effort: report the problem column and keep going.
            print(f"Column {col} failed: {e}")


def transform_export_data(X, y, sample_ratio, file_path=None):
    """Split, preprocess, oversample and optionally export one outcome's data.

    The data are split with stratification into 70% train, 15% validation and
    15% test. The module-level ``preprocessor`` is (re)fitted on the training
    split only and then applied to all three splits. An additional
    oversampled copy of the training split is produced with SMOTENC.

    Note that calling this function refits the shared global ``preprocessor``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the columns the global ``preprocessor``
        was configured with.
    y : pandas.Series
        Outcome labels aligned with ``X``; used for stratification.
    sample_ratio : float
        Forwarded to ``SMOTENC(sampling_strategy=...)``.
    file_path : pathlib.Path, optional
        Output directory. If given, any existing directory at that path is
        deleted and recreated, then the feature frames are written as Parquet
        and the label series as Excel files.

    Returns
    -------
    dict
        Keys ``X_train``, ``y_train``, ``X_train_res``, ``y_train_res``,
        ``X_val``, ``y_val``, ``X_test``, ``y_test``.
    """
    # 70% train, 30% held out ...
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=SEED, stratify=y
    )
    # ... and the held-out part is halved into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
    )

    # Fit on the training split only so nothing leaks from val/test.
    preprocessor.fit(X_train)
    feature_names = preprocessor.get_feature_names_out()

    X_train_transformed = _to_feature_frame(X_train, feature_names)
    X_val_transformed = _to_feature_frame(X_val, feature_names)
    X_test_transformed = _to_feature_frame(X_test, feature_names)

    # The transformed frames are freshly built and already carry a 0..n-1
    # index; only the label series still hold the original row labels.
    y_train = y_train.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Coerce all three frames (the test frame was previously skipped because
    # its loop converted the validation frame a second time).
    for frame in (X_train_transformed, X_val_transformed, X_test_transformed):
        _coerce_columns_to_numeric(frame)

    ######### CREATE OVER SAMPLED SET ##########
    # Columns that get_feature_lists classifies as binary in the transformed
    # training frame are declared categorical to SMOTENC.
    binary_feature_cols = set(
        get_feature_lists(X_train_transformed)["binary_cols"]
    )
    categorical_indices = [
        i
        for i, c in enumerate(X_train_transformed.columns)
        if c in binary_feature_cols
    ]
    smote_nc = SMOTENC(
        categorical_features=categorical_indices,
        sampling_strategy=sample_ratio,
        random_state=SEED,
    )
    train_res = smote_nc.fit_resample(X_train_transformed.values, y_train.values)
    X_train_resampled, y_train_resampled = train_res[:2]
    X_train_resampled = pd.DataFrame(
        X_train_resampled, columns=X_train_transformed.columns
    )
    y_train_resampled = pd.Series(np.array(y_train_resampled))
    ######### CREATE OVER SAMPLED SET ##########

    if file_path:
        # Start from an empty directory so stale files never survive a rerun.
        if file_path.exists():
            rmtree(file_path)
        file_path.mkdir(exist_ok=False, parents=True)
        X_train_transformed.to_parquet(file_path / "X_train.parquet")
        y_train.to_excel(file_path / "y_train.xlsx")
        X_train_resampled.to_parquet(file_path / "X_train_res.parquet")
        y_train_resampled.to_excel(file_path / "y_train_res.xlsx")
        X_val_transformed.to_parquet(file_path / "X_val.parquet")
        y_val.to_excel(file_path / "y_val.xlsx")
        X_test_transformed.to_parquet(file_path / "X_test.parquet")
        y_test.to_excel(file_path / "y_test.xlsx")

    return {
        "X_train": X_train_transformed,
        "y_train": y_train,
        "X_train_res": X_train_resampled,
        "y_train_res": y_train_resampled,
        "X_val": X_val_transformed,
        "y_val": y_val,
        "X_test": X_test_transformed,
        "y_test": y_test,
    }


# Build and export one train/val/test bundle per outcome. Each call refits the
# shared `preprocessor` and deletes/recreates its output directory under
# data_path. The third argument is the SMOTENC sampling_strategy.
data_path = BASE_PATH / "data" / "processed"
# Surgical outcome: 8.4% positive without oversampling
# (author's note: about 0.09 positives per negative).
surg_data = transform_export_data(
    X_sub, outcome_df["Surgical_Outcome"], 0.27, data_path / "outcome_surg"
)
# Bleed outcome: 10% positive without oversampling
# (author's note: about 0.11 positives per negative).
bleed_data = transform_export_data(
    X_sub, outcome_df["Bleed_Outcome"], 0.33, data_path / "outcome_bleed"
)
# Aspiration outcome: 7.05% positive without oversampling
# (author's note: about 0.07 positives per negative).
asp_data = transform_export_data(
    X_sub, outcome_df["Aspiration_Outcome"], 0.21, data_path / "outcome_asp"
)
# Mortality outcome: 0.9% positive without oversampling
# (author's note: about 0.01 positives per negative).
# NOTE(review): X_sub_mort (X_sub without WNDINF) is built here but the call
# below still passes X_sub, so WNDINF remains a mortality feature. Confirm
# which is intended. Simply swapping in X_sub_mort may fail, because the
# preprocessor's column lists were derived from X_sub and presumably still
# reference WNDINF -- verify before changing.
X_sub_mort = X_sub.drop("WNDINF", axis=1)
mort_data = transform_export_data(
    X_sub, outcome_df["Mortality_Outcome"], 0.1, data_path / "outcome_mort"
)

# outcome_data = {
#     'Surgical Wound Complication': surg_data,
#     'Bleed': bleed_data,
#     'Aspiration Complications': asp_data,
#     'Mortality': mort_data
# }