# Get Summary + Analysis Tables

## Import + set globals

In [None]:
import sys
import os
import warnings

sys.path.append(os.path.abspath("../"))
from src.config import BASE_PATH, SEED
from src.data_utils import get_feature_lists
from src.summary_analysis import (
    generate_fish_list,
    get_analysis_df,
    generate_summary_table,
)

import pandas as pd
from tqdm.notebook import tqdm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Read the two input tables from parquet files under BASE_PATH/data:
# the combined NSQIP table and the outcome table.
mast_parquet_path = (
    BASE_PATH / "data" / "raw" / "cleaned" / "NSQIP_mast_combined.parquet"
)
outcome_parquet_path = BASE_PATH / "data" / "processed" / "outcome_df.parquet"

import_df = pd.read_parquet(mast_parquet_path)
outcome_df = pd.read_parquet(outcome_parquet_path)

## More outcome handling
# Map every column name of outcome_df to that column's data.
OUTCOME_DICT = {col: outcome_df[col] for col in outcome_df}

# Outcome name -> list of related sub-columns (empty list when there are none).
# The non-empty lists mirror the groupings used for Table 3 in
# reorder_cols_list; presumably these are the component complications of each
# composite outcome -- confirm against src.summary_analysis.
outcome_sub_cols = dict(
    AnySurgComp=["SUPINFEC", "WNDINFD", "ORGSPCSSI", "DEHIS", "OTHBLEED"],
    MORTALITY=[],
    UNPLREOP=[],
    VTE=["OTHDVT", "PULEMBOL"],
    AnyMedComp=[
        "OUPNEUMO",
        "REINTUB",
        "FAILWEAN",
        "URNINFEC",
        "CNSCVA",
        "CDARREST",
        "CDMI",
        "OTHSYSEP",
        "OTHSESHOCK",
        "RENAINSF",
        "OPRENAFL",
    ],
)

## Summary Cleaning

BMI

In [None]:
# BMI = weight / height^2 * 703. The factor 703 is the imperial-unit BMI
# constant, so this presumably expects WEIGHT in pounds and HEIGHT in
# inches -- TODO confirm against the data dictionary.
height_squared = import_df["HEIGHT"].pow(2)
import_df["BMI"] = import_df["WEIGHT"].div(height_squared).mul(703)
# The raw measurements are no longer needed once BMI exists.
import_df = import_df.drop(columns=["WEIGHT", "HEIGHT"])

Re-order + remove CPT/ICD code columns

In [None]:
# Columns to keep, in the order they are laid out in the output tables.
# Built from one small list per sub-section so each grouping is visible.
reorder_cols_list = (
    ############ TABLE 1 ############
    ## Demographics
    ["AGE", "BMI", "SEX", "RACE", "ETHNICITY_HISPANIC"]
    ## Pre-op health + comorbidities
    + ["DIABETES", "HXCOPD", "HXCHF", "ASCITES", "BLEEDDIS", "TRANSFUS", "DIALYSIS"]
    + ["HYPERMED", "VENTILAT", "SMOKE", "DISCANCR", "STEROID", "ASACLAS", "PRALBUM"]
    + ["PRWBC", "PRHCT", "PRPLATE", "RENAFAIL", "DYSPNEA", "WNDINF", "WTLOSS"]
    ############ TABLE 2 ############
    + ["OPTIME"]
    ## Surgical Indication (ICD)
    + ["SURGINDICD"]
    ## Lymph Node Surgery
    + ["SNLBCPT", "ALNDCPT", "NOLYMPH"]
    ## Type of Mastectomy
    + ["PARTIALCPT", "SUBSIMPLECPT", "RADICALCPT", "MODIFIEDRADICALCPT"]
    ## Procedural Anatomy
    + ["PROCANATCPT"]
    ## Reconstruction CPT CODES
    + ["IMMEDIATECPT", "DELAYEDCPT", "TEINSERTIONCPT", "TEEXPANDERCPT"]
    + ["FREECPT", "LATCPT", "SINTRAMCPT", "SINTRAMSUPERCPT", "BITRAMCPT"]
    + ["MASTOCPT", "BREASTREDCPT", "FATGRAFTCPT", "ADJTISTRANSCPT"]
    + ["AUGPROSIMPCPT", "OTHERRECONTECHCPT", "REVRECBREASTCPT"]
    ## Timing of Surgery
    + ["SURGTIMINGCPT"]
    ## NPWT
    + ["NPWTCPT"]
    ## Other stuff
    + ["URGENCY", "SURGSPEC", "ANESTHES", "INOUT", "OPERYR"]
    ############ TABLE 3 ############
    + ["TOTHLOS", "MORTALITY", "UNPLREOP", "READ", "UNPLREAD"]
    ### Surgical Complications
    + ["AnySurgComp", "SUPINFEC", "WNDINFD", "ORGSPCSSI", "DEHIS", "OTHBLEED"]
    ## VTE Complications
    + ["VTE", "OTHDVT", "PULEMBOL"]
    ## non-VTE Medical Complications
    + ["AnyMedComp", "OUPNEUMO", "REINTUB", "FAILWEAN", "URNINFEC", "CNSCVA"]
    + ["CDARREST", "CDMI", "OTHSYSEP", "OTHSESHOCK", "RENAINSF", "OPRENAFL"]
)

# Keep only the listed columns, in list order; work on a copy so later
# changes do not touch import_df.
ordered_df = import_df.loc[:, reorder_cols_list].copy()

## Export
cleaned_dir = BASE_PATH / "data" / "raw" / "cleaned"
final_cleaned_path = cleaned_dir / "final_cleaned.parquet"
# Remove any previous export before writing the fresh one.
if final_cleaned_path.exists():
    final_cleaned_path.unlink()
ordered_df.to_parquet(final_cleaned_path)

Impute numerical columns (plus ASACLAS)

In [None]:
## Get feature lists
FEATURE_DICT = get_feature_lists(ordered_df)
numerical_cols, nominal_cols, ordinal_cols, binary_cols = (
    FEATURE_DICT[key]
    for key in ("numerical_cols", "nominal_cols", "ordinal_cols", "binary_cols")
)

## Make Year an ordinal variable instead of numerical --> just for this portion of proj
numerical_cols.remove("OPERYR")
ordinal_cols.append("OPERYR")
FEATURE_DICT["numerical_cols"] = numerical_cols
FEATURE_DICT["ordinal_cols"] = ordinal_cols

## Impute
# ASACLAS is imputed together with the numerical columns and rounded afterwards.
impute_cols = [*numerical_cols, "ASACLAS"]
imputer_settings = {
    "estimator": None,  # default = BayesianRidge
    "initial_strategy": "median",
    "max_iter": 10,
    "sample_posterior": False,  # deterministic
}
imputer = IterativeImputer(**imputer_settings)

# Impute on a copy so ordered_df keeps its original missing values.
df_impute = ordered_df.copy()
imputed_values = imputer.fit_transform(df_impute[impute_cols])
df_impute[impute_cols] = imputed_values

## Round ASACLAS imputations back to whole numbers
df_impute["ASACLAS"] = df_impute["ASACLAS"].round().astype(float)

# Sanity check: no missing value may remain anywhere in the frame.
assert not df_impute.isna().any().any()

Export DF used for stats

In [None]:
# Write the imputed frame to data/processed, replacing any earlier export.
processed_dir = BASE_PATH / "data" / "processed"
stats_df_path = processed_dir / "df_used_for_sum_analysis.parquet"

processed_dir.mkdir(parents=True, exist_ok=True)
if stats_df_path.exists():
    stats_df_path.unlink()
df_impute.to_parquet(stats_df_path)

## Run Summary + Analysis

In [None]:
# Build one wide table: for every outcome in OUTCOME_DICT, a summary table and
# a univariable analysis table are generated, joined side by side, and then
# all outcomes are concatenated column-wise.

# Get features w/ expected freq < 5
fish_dict = generate_fish_list(
    df_impute, OUTCOME_DICT, FEATURE_DICT["binary_cols"], verbose=False
)

## Dict: {column_name: [<unique_entries>]} for every categorical column
all_categories = {
    col: df_impute[col].unique() for col in nominal_cols + binary_cols + ordinal_cols
}

final_tables = []
for i, (outcome_name, outcome) in enumerate(
    tqdm(OUTCOME_DICT.items(), desc="Outcomes")
):
    print(f"Working on {outcome_name}...")
    ## Get summary
    summary_df = generate_summary_table(
        X_df_final=df_impute,
        X_df_og=ordered_df,
        outcome_data=outcome,
        outcome_name=outcome_name,
        all_categories=all_categories,
        outcome_sub_cols=outcome_sub_cols,
    )
    # Df containing univariable values (p-values, ORs w/ CIs)
    analysis_df = get_analysis_df(
        df=df_impute,
        outcome_data=outcome,
        outcome_name=outcome_name,
        outcome_sub_cols=outcome_sub_cols,
        fish_dict=fish_dict,
    )
    # Both tables must describe exactly the same rows before they are joined.
    # Checked explicitly (not with `assert`) so the guard also runs under
    # `python -O`; the exception type is unchanged.
    analysis_rows = set(analysis_df.index.to_list())
    summary_rows = set(summary_df.index.to_list())
    if analysis_rows != summary_rows:
        print(analysis_rows - summary_rows)
        print(summary_rows - analysis_rows)
        raise AssertionError("Analysis and summary tables DO NOT match!")
    ## Append with summary
    final_table = pd.concat([summary_df, analysis_df], axis=1)
    # On first iteration, save the "all patients" column
    if i == 0:
        # Select with a list so the result is a one-column DataFrame. With a
        # scalar position the result is a Series, and assigning `.columns` on
        # a Series only sets a stray attribute, so the header was never renamed.
        all_patients_col = final_table.iloc[:, [0]].copy()  # first column
        n_total = len(df_impute)
        all_patients_col.columns = [f"Total Patients (n={n_total})"]
        final_tables.append(all_patients_col)
    # Append only the outcome-specific columns (positive/negative)
    outcome_specific = final_table.iloc[:, 1:]  # all columns except first
    final_tables.append(outcome_specific)
wide_table = pd.concat(final_tables, axis=1)

Export

In [None]:
# Write the wide table to results/tables as an Excel file, warning when an
# existing file is replaced.
tables_dir = BASE_PATH / "results" / "tables"
export_path = tables_dir / "summary_analysis.xlsx"

if export_path.exists():
    export_path.unlink()
    warnings.warn(f"Over-writing table at path {export_path}")
tables_dir.mkdir(exist_ok=True, parents=True)
wide_table.to_excel(export_path, index=True)