# Generate Summary/Analysis Tables

## Set Globals

In [None]:
# Libraries/packages
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import SEED, BASE_PATH
from src.summary_analysis import (
    generate_summary_table,
    get_analysis_df,
    generate_fish_list,
)
import warnings
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd

In [None]:
# Both input workbooks live in the same folder and use their first column as
# the row index.
processed_dir = BASE_PATH / "data" / "processed"

# Feature table
df = pd.read_excel(processed_dir / "fully_cleaned_tongue_data.xlsx", index_col=0)
# Outcome table
outcome_df = pd.read_excel(processed_dir / "Outcome_df.xlsx", index_col=0)

# {outcome column name: that column of outcome_df}
OUTCOME_DICT = {name: outcome_df[name] for name in outcome_df}

# Component columns behind each outcome: a list for the composite outcomes,
# a single column name for the rest.
outcome_sub_cols = {
    "Surgical_Outcome": [
        "SUPINFEC",  # Superficial infection
        "WNDINFD",  # Deep infection
        "ORGSPCSSI",  # Organ space infection
        "DEHIS",  # Dehiscence
    ],
    "Bleed_Outcome": "OTHBLEED",
    "Aspiration_Outcome": [
        "OUPNEUMO",  # Pneumonia
        "REINTUB",  # Reintubation
        "FAILWEAN",  # Ventilator >48
    ],
    "Mortality_Outcome": "Mortality",
    "ReOp_Outcome": "UNPLREOP",
}

## More Data Prep

### General columns (only for summary + statistical analysis)

In [None]:
## Any surgical complication
# NOTE(review): the outcome values are copied in by position (raw array), not
# aligned on index labels -- assumes df and outcome_df share the same row
# order; confirm.
surgical_flags = OUTCOME_DICT["Surgical_Outcome"].values.ravel()
df["ANY SURGICAL COMPLICATION"] = surgical_flags

## Any medical complication
# Columns that count towards the composite medical-complication flag
any_medical_comp_cols = [
    "OTHDVT",
    "PULEMBOL",
    "OPRENAFL",  # Post-op Dialysis
    "RENAINSF",  # Post-op renal insufficiency
    "URNINFEC",  # UTI
    "CNSCVA",  # Stroke
    "CDARREST",
    "CDMI",  # Myocardial infarction
    "OTHSYSEP",  # Post-op Sepsis
    "OTHSESHOCK",  # Septic shock
]
# 1 when at least one of the columns above equals "Yes" for the row, else 0
medical_is_yes = df[any_medical_comp_cols].eq("Yes")
df["ANY MEDICAL COMPLICATION"] = medical_is_yes.any(axis=1).astype(int)

## Any aspiration complication (same positional copy as the surgical flag)
aspiration_flags = OUTCOME_DICT["Aspiration_Outcome"].values.ravel()
df["ANY ASPIRATION COMPLICATION"] = aspiration_flags

## Calculate BMI
# weight / height^2 * 703 -- presumably WEIGHT is in pounds and HEIGHT in
# inches (703 is the imperial conversion factor); confirm against the data.
height_squared = df["HEIGHT"].pow(2)
df["BMI"] = df["WEIGHT"].div(height_squared).mul(703)
# The raw measurements are dropped once BMI has been derived from them
df.drop(columns=["WEIGHT", "HEIGHT"], inplace=True)

Re-order the columns into pre-op, intra-op, and post-op groups

In [None]:
# Column order for the summary table, built from one list per phase/group and
# concatenated below. Names are upper-cased before they are looked up in df.

# Pre-op
pre_op_cols = [
    "SEX",  # Nominal
    "RACE_NEW",  # Nominal
    "ETHNICITY_HISPANIC",  # Binary
    "INOUT",  # Binary
    "Age",  # Numerical
    "URGENCY",  # Nominal
    "BMI",
    "DIABETES",  # Binary
    "SMOKE",  # Binary
    "DYSPNEA",  # Binary
    "FNSTATUS2",
    "VENTILAT",
    "HXCOPD",
    "ASCITES",
    "HXCHF",
    "HYPERMED",
    "RENAFAIL",
    "DIALYSIS",
    "DISCANCR",
    "WNDINF",
    "STEROID",
    "WTLOSS",
    "BLEEDDIS",
    "TRANSFUS",
    "PRSEPIS",
    "PRALBUM",
    "PRWBC",
    "PRHCT",
    "PRPLATE",
    "ASACLAS",
]

# Intra-op
intra_op_cols = [
    "OPTIME",
    "Partial Glossectomy (Hemiglossectomy_Subtotal)",
    "Composite_Extended Glossectomy",
    "Total Glossectomy (Complete Tongue Removal)",
    "Excision of Tongue Lesions (Minor)",
    "Local_Regional Tissue Flaps for Oral Cavity Reconstruction",
    "Free Tissue Transfer (Microvascular Free Flaps) and Complex Flap Reconstruction",
    "Skin Autografts for Head and Neck Reconstruction",
    "Neck Dissection and Lymphadenectomy Procedures",
    "Alveolar Ridge and Gingival Procedures",
    "Mandibular Resection and Reconstruction Procedures",
    "Peripheral Nerve Repair and Neuroplasty",
    "Tracheostomy Procedures",
    "Gastrostomy and Esophageal Access Procedures",
    "Submandibular Gland Excision",
    "Parotid Gland Excision",
    "Laryngeal Resection and Reconstruction Procedures",
    "Pharyngeal Resection and Reconstruction Procedures",
    "Tonsillectomy and Tonsillar Region Procedures",
    "Malignant neoplasm",
    "OPERYR",
]

# Post-op (OPTIME and functional status are already listed above)
post_op_cols = [
    "TOTHLOS",
    "Mortality",
    "UNPLREOP",
    "READ",
    "UNPLREAD",
    "DISCHDEST",
]

# Any surgical complication, followed by its components
surgical_comp_cols = [
    "Any Surgical Complication",
    "SUPINFEC",
    "WNDINFD",  # Deep infection
    "ORGSPCSSI",  # Organ space infection
    "DEHIS",  # Dehiscence
]

# Bleeding is reported separately from the composite flags
bleed_cols = ["OTHBLEED"]

# Any aspiration complication, followed by its components
aspiration_comp_cols = [
    "Any Aspiration Complication",
    "FAILWEAN",  # Ventilator >48
    "REINTUB",  # Reintubation
    "OUPNEUMO",  # Pneumonia
]

# Any medical complication, followed by its components
medical_comp_cols = [
    "Any Medical Complication",
    "OTHDVT",
    "PULEMBOL",
    "OPRENAFL",  # Post-op Dialysis
    "RENAINSF",  # Post-op renal insufficiency
    "URNINFEC",  # UTI
    "CNSCVA",  # Stroke
    "CDARREST",
    "CDMI",  # Myocardial infarction
    "OTHSYSEP",  # Post-op Sepsis
    "OTHSESHOCK",  # Septic shock
]

reordered_cols = [
    *pre_op_cols,
    *intra_op_cols,
    *post_op_cols,
    *surgical_comp_cols,
    *bleed_cols,
    *aspiration_comp_cols,
    *medical_comp_cols,
]

# Select the columns by their upper-cased names, in the order defined above
reordered_cols_cap = list(map(str.upper, reordered_cols))
ordered_df = df.loc[:, reordered_cols_cap].copy()

In [None]:
# Classify features: split the ordered columns into binary / numerical /
# nominal / ordinal lists.
FEATURE_DICT = get_feature_lists(ordered_df)
binary_cols, numerical_cols, nominal_cols, ordinal_cols = (
    FEATURE_DICT[kind]
    for kind in ("binary_cols", "numerical_cols", "nominal_cols", "ordinal_cols")
)

## Treat the operation year as ordinal rather than numerical --> only for this
## portion of the project. `remove` raises ValueError if OPERYR was not
## classified as numerical.
numerical_cols.remove("OPERYR")
ordinal_cols.append("OPERYR")

# Write the adjusted lists back under their keys
for kind, cols in (("numerical_cols", numerical_cols), ("ordinal_cols", ordinal_cols)):
    FEATURE_DICT[kind] = cols

Impute missing values in the numerical columns

In [None]:
## Impute
# Fill the numerical columns with an IterativeImputer fitted on those same
# columns. The project-wide SEED is passed as random_state so the result stays
# reproducible even if sample_posterior, imputation_order or
# n_nearest_features are changed later; it does not alter the output under
# the settings used here.
imputer = IterativeImputer(
    estimator=None,  # default = BayesianRidge
    initial_strategy="median",
    max_iter=10,
    sample_posterior=False,  # deterministic
    random_state=SEED,
)
# Impute on a copy: ordered_df is still needed later with its original values
df_impute = ordered_df.copy()
imputed_values = imputer.fit_transform(df_impute[numerical_cols])
df_impute[numerical_cols] = imputed_values

## Summary + Analysis

In [None]:
# Get features w/ expected freq < 5
# (presumably the ones that need Fisher's exact test -- confirm in
# src.summary_analysis)
fish_dict = generate_fish_list(
    df_impute, OUTCOME_DICT, FEATURE_DICT["binary_cols"], verbose=False
)

## Dict: {column_name: [<unique_entries>]} for every categorical column
all_categories = {
    col: df_impute[col].unique() for col in nominal_cols + binary_cols + ordinal_cols
}

# Build one table per outcome, then place them side by side.
final_tables = []
for i, (outcome_name, outcome) in enumerate(OUTCOME_DICT.items()):
    print(f"Working on {outcome_name}...")
    ## Get summary
    summary_df = generate_summary_table(
        X_df_final=df_impute,
        X_df_og=ordered_df,
        outcome_data=outcome,
        outcome_name=outcome_name,
        all_categories=all_categories,
        outcome_sub_cols=outcome_sub_cols,
    )
    # Df containing univariable values (p-values, ORs w/ CIs)
    analysis_df = get_analysis_df(
        df=df_impute,
        outcome_data=outcome,
        outcome_name=outcome_name,
        outcome_sub_cols=outcome_sub_cols,
        fish_dict=fish_dict,
    )
    # Both tables must describe exactly the same rows before they are joined.
    # This is an explicit comparison rather than an `assert` statement, because
    # asserts are removed under `python -O` and a mismatch would then slip
    # through. AssertionError is still the exception raised.
    analysis_rows = set(analysis_df.index.to_list())
    summary_rows = set(summary_df.index.to_list())
    if analysis_rows != summary_rows:
        # Show the rows that appear on one side only, to help debugging
        print(analysis_rows - summary_rows)
        print(summary_rows - analysis_rows)
        raise AssertionError("Analysis and summary tables DO NOT match!")
    ## Append with summary
    final_table = summary_df.join(analysis_df, how="left").fillna("")
    # On first iteration, save the "all patients" column
    if i == 0:
        all_patients_col = final_table.iloc[:, 0].copy()  # first column
        n_total = len(df_impute)
        all_patients_col.name = f"Total Patients (n={n_total})"
        final_tables.append(all_patients_col)
    # Append only the outcome-specific columns (positive/negative)
    outcome_specific = final_table.iloc[:, 1:]  # all columns except first
    final_tables.append(outcome_specific)
wide_table = pd.concat(final_tables, axis=1)

Export the combined table to Excel

In [None]:
# Write the combined table to <BASE_PATH>/results/tables/AllAnalysisTables.xlsx
export_path = BASE_PATH.joinpath("results", "tables", "AllAnalysisTables.xlsx")

# An existing file is deleted first, and a warning records the overwrite
already_exported = export_path.exists()
if already_exported:
    export_path.unlink()
    warnings.warn(f"Over-writing table at path {export_path}")

# Create the output folder (and any missing parents) before writing
export_path.parent.mkdir(parents=True, exist_ok=True)
wide_table.to_excel(export_path, index=True)