# Data Cleaning

In [11]:
import sys

sys.path.append("../")
from src.data_utils import get_feature_lists
from src.config import BASE_PATH

import pandas as pd
import numpy as np

# Read the extracted CPT/ICD workbook; the sheet's first column becomes the row index.
import_path = BASE_PATH / "data" / "processed" / "CPT_ICD_Extracted_Cleaned.xlsx"
df_import = pd.read_excel(import_path, index_col=0)

## Data prep

In [None]:
### Drop CPT/ICD columns ###
# Columns removed from the imported table before cleaning: the case identifier,
# the procedure-code (CPT) columns, the diagnosis-code (PODIAG/ICD) columns, and
# three further columns listed at the end.
drop_cols = [
    "CaseID",
    # CPT
    "CPT",
    *[f"OTHERCPT{i}" for i in range(1, 11)],  # OTHERCPT1 .. OTHERCPT10
    *[f"CONCPT{i}" for i in range(1, 11)],  # CONCPT1 .. CONCPT10
    "REOPORCPT1",
    # Podiag/ICD
    "PODIAG",
    "PODIAG10",
    "REOPOR1ICD91",
    "REOPOR1ICD101",
    "AdmYR",
    "SURGSPEC",
    "ANESTHES",
]

# `columns=` already names the axis, so no separate `axis` argument is needed.
df_sub = df_import.drop(columns=drop_cols)
print(f"Shape: {df_sub.shape}")
print("NAs:")
# Show the missing-value count for every column that still has at least one NA,
# so the header above is followed by an actual report.
na_counts = df_sub.isna().sum()
print(na_counts[na_counts > 0].to_string())

Fill NAs

**Criteria:** if a feature is recorded in every data year, its missing values are filled with the "No" level; otherwise they are filled with "Unknown".


In [None]:
### Fill NAs ###
# CRITERIA: if available all years--> NO otherwise UNKNOWN
# Column -> value substituted for missing entries in that column.
fill_na_dict = dict(
    RACE_NEW="Unknown",
    ETHNICITY_HISPANIC="No_Unknown",
    DISCHDEST="Unknown",
    ELECTSURG="Unknown",
    DYSPNEA="Unknown",
    RENAFAIL="No",
    WNDINF="Unknown",
    WTLOSS="Unknown",
    PRSEPIS="No",
    RENAINSF="No Complication",
    READMISSION1="No",
    UNPLANNEDREADMISSION1="No",
)
df_sub = df_sub.fillna(value=fill_na_dict)
## Ensure no more NA
remaining_na_total = df_sub.isna().sum().sum()
assert remaining_na_total == 0

## Clean Categorical Features

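Normalize category labels: collapse spelling variants and rarely used levels of each categorical feature into one consistent set of names.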


In [None]:
# Nested mapping {column: {raw label: normalized label}}.
# Labels that are not listed for a column are not part of the mapping.
# NOTE(review): several keys (e.g. "Partially D", "Myocardial Infa", "Progressive Ren")
# look like truncated spellings of a neighbouring full label -- presumably they
# come from differently exported source files; confirm against the raw data.
replace_dict = {
    "SEX": {"non-bi": "non-binary"},
    # Every label listed here collapses into the single "Unknown_Other" level.
    "RACE_NEW": {
        "Unknown/Not Reported": "Unknown_Other",
        "Unknown": "Unknown_Other",
        "Some Other Race": "Unknown_Other",
        "Native Hawaiian or Other Pacific Islander": "Unknown_Other",
        "American Indian or Alaska": "Unknown_Other",
        "Native Hawaiian or Pacifi": "Unknown_Other",
        "American Indian or Alaska Native": "Unknown_Other",
        "White,Black or African American": "Unknown_Other",
        "Native Hawaiian or Other Pacific Islander,Asian": "Unknown_Other",
        "Asian,Some Other Race": "Unknown_Other",
        "Native Hawaiian or Pacific Islander": "Unknown_Other",
        "American Indian or Alaskan Native": "Unknown_Other",
    },
    # "No" and "Unknown" spellings share the "No_Unknown" level; "Y" becomes "Yes".
    "ETHNICITY_HISPANIC": {
        "N": "No_Unknown",
        "No": "No_Unknown",
        "U": "No_Unknown",
        "Unknown": "No_Unknown",
        "Unk": "No_Unknown",
        "Y": "Yes",
    },
    # Recoded as a Yes/No flag where "Yes" stands for "Inpatient".
    "INOUT": {"Inpatient": "Yes", "Outpatient": "No"},
    # Discharge destinations are reduced to three levels: "Home_Permanent Residence",
    # "Acute Care" and "Unknown_Other Facility".
    "DISCHDEST": {
        "Home": "Home_Permanent Residence",
        "Unknown": "Unknown_Other Facility",
        "Facility Which was Home": "Home_Permanent Residence",
        "Separate Acute Care": "Acute Care",
        "Acute Care Hospital": "Acute Care",
        "Unskilled Facility Not Home": "Unknown_Other Facility",
        "Hospice": "Unknown_Other Facility",
        "Other Facility": "Unknown_Other Facility",
        "Against Medical Advice (AMA)": "Unknown_Other Facility",
    },
    # "Yes" -> "Elective"; "No", "Urgent" and "Emergent" share "Urgent_Emergent".
    "ELECTSURG": {
        "Yes": "Elective",
        "No": "Urgent_Emergent",
        "Urgent": "Urgent_Emergent",
        "Emergent": "Urgent_Emergent",
        "Unk": "Unknown",
    },
    # Any listed diabetes treatment label becomes "Yes".
    "DIABETES": {
        "NO": "No",
        "NON-INSULIN": "Yes",
        "INSULIN": "Yes",
        "ORAL": "Yes",
    },
    "DYSPNEA": {
        "Unknown": "Unknown_Other",
        "MODERATE EXERTION": "Yes",
        "AT REST": "Yes",
    },
    # Partial and total dependence are merged into one "Dependent" level.
    "FNSTATUS2": {
        "Unknown": "Other_Unknown",
        "Partially Dependent": "Dependent",
        "Partially D": "Dependent",
        "Totally Dependent": "Dependent",
        "Totally Dep": "Dependent",
    },
    "DIALYSIS": {"Ye": "Yes"},
    # Unlike the other columns, the keys here are the integers 0 and 1, not strings.
    "WNDINF": {0: "No", 1: "Yes"},
    "WTLOSS": {"Unknown": "Unknown_Other"},
    "PRSEPIS": {"SIRS": "Yes", "Sepsis": "Yes"},
    # For the columns below, "No Complication" becomes "No" and each listed
    # complication label becomes "Yes".
    "SUPINFEC": {"No Complication": "No", "Superficial Incisional SSI": "Yes"},
    "WNDINFD": {"No Complication": "No", "Deep Incisional SSI": "Yes"},
    "ORGSPCSSI": {"No Complication": "No", "Organ/Space SSI": "Yes"},
    "DEHIS": {"No Complication": "No", "Wound Disruption": "Yes"},
    "OUPNEUMO": {"No Complication": "No", "Pneumonia": "Yes"},
    "REINTUB": {
        "No Complication": "No",
        "Unplanned Intub": "Yes",
        "Unplanned Intubation": "Yes",
    },
    "PULEMBOL": {"No Complication": "No", "Pulmonary Embolism": "Yes"},
    "FAILWEAN": {"No Complication": "No", "On Ventilator greater than 48 Hours": "Yes"},
    "RENAINSF": {
        "No Complication": "No",
        # The key below contains two consecutive spaces after "Postop".
        "Postop  Renal Insufficiency": "Yes",
        "Progressive Renal Insufficiency": "Yes",
        "Progressive Ren": "Yes",
    },
    "OPRENAFL": {
        "No Complication": "No",
        "Acute Renal Failure": "Yes",
        "Postop Dialysis": "Yes",
    },
    "URNINFEC": {"No Complication": "No", "Urinary Tract Infection": "Yes"},
    "CNSCVA": {"No Complication": "No", "Stroke/CVA": "Yes"},
    "CDARREST": {
        "No Complication": "No",
        "Cardiac Arrest Requiring CPR": "Yes",
        "Cardiac Arrest": "Yes",
    },
    "CDMI": {
        "No Complication": "No",
        "Myocardial Infarction": "Yes",
        "Myocardial Infa": "Yes",
    },
    "OTHBLEED": {
        "No Complication": "No",
        "Transfusions/Intraop/Postop": "Yes",
        "Blood Transfusion": "Yes",
    },
    "OTHDVT": {
        "No Complication": "No",
        "DVT Requiring Therapy": "Yes",
        "Venous Thrombosis Requiring Therapy": "Yes",
        "DVT Requiring Therap": "Yes",
    },
    "OTHSYSEP": {"No Complication": "No", "Sepsis": "Yes"},
    "OTHSESHOCK": {"No Complication": "No", "Septic Shock": "Yes"},
    # "NUL" is recoded as "No" for both columns.
    "RETURNOR": {"NUL": "No"},
    "READMISSION1": {"NUL": "No"},
    # Non-numeric age entries: "90+" becomes "90" and a single-space string becomes "-99".
    "Age": {"90+": "90", " ": "-99"},
}

# Apply the label normalization table column by column.
df_replaced = df_sub.replace(replace_dict)
# Age is cast to integer once its non-numeric entries have been recoded.
df_replaced["Age"] = df_replaced["Age"].astype(int)
# A YRDEATH of -99 is recoded as "No"; any other value as "Yes". The column keeps
# its position and is then renamed to "Mortality".
no_death_year = df_replaced["YRDEATH"] == -99
df_replaced["YRDEATH"] = np.where(no_death_year, "No", "Yes")
df_replaced = df_replaced.rename(columns={"YRDEATH": "Mortality"})

Remove sparse entries

In [None]:
# Rows carrying any of these column/level pairs are removed:
#   SEX        -> "non-binary"
#   FNSTATUS2  -> "Other_Unknown"
#   ASACLAS    -> "None assigned"
sparse_levels = {
    "SEX": "non-binary",
    "FNSTATUS2": "Other_Unknown",
    "ASACLAS": "None assigned",
}
for sparse_col, sparse_level in sparse_levels.items():
    df_replaced = df_replaced[df_replaced[sparse_col] != sparse_level]

## Create Target Variables

In [None]:
# Shared encoding for the single-column outcomes.
yes_no_to_int = {"No": 0, "Yes": 1}

### Surgical wound complications ###
surg_wound_compl_cols = [
    "SUPINFEC",  # Superficial infection
    "WNDINFD",  # Deep infection
    "ORGSPCSSI",  # Organ space infection
    "DEHIS",  # Dehiscence
]
# 1 when at least one of the component columns is "Yes", else 0.
y_surg_wnd_comp = (
    df_replaced[surg_wound_compl_cols]
    .eq("Yes")
    .any(axis=1)
    .astype(int)
    .rename("Surgical_Outcome")
)

### Bleed ###
y_bleed = df_replaced["OTHBLEED"].map(yes_no_to_int).astype(int).rename("Bleed_Outcome")

### Aspiration complications ###
asp_compl_cols = [
    "OUPNEUMO",  # Pneumonia
    "REINTUB",  # Reintubation
    "FAILWEAN",  # Ventilator > 48hrs
]
# 1 when at least one of the component columns is "Yes", else 0.
y_asp_comp = (
    df_replaced[asp_compl_cols]
    .eq("Yes")
    .any(axis=1)
    .astype(int)
    .rename("Aspiration_Outcome")
)

### Mortality ###
y_mortality = (
    df_replaced["Mortality"].map(yes_no_to_int).astype(int).rename("Mortality_Outcome")
)

## Outcome name -> outcome series (keyed by each series' own name)
OUTCOME_DICT = {
    outcome.name: outcome
    for outcome in (y_surg_wnd_comp, y_bleed, y_asp_comp, y_mortality)
}
## Outcome name -> source column(s): a list for composite outcomes, a single
## column name (string) for single-column outcomes
outcome_sub_cols = {
    "Surgical_Outcome": surg_wound_compl_cols,
    "Bleed_Outcome": "OTHBLEED",
    "Aspiration_Outcome": asp_compl_cols,
    "Mortality_Outcome": "Mortality",
}

## Clean Numerical Features

Normalize NA

In [None]:
## ALL NA values are listed as -99/999, not NA/missing
numerical_cols = get_feature_lists(df_replaced)["numerical_cols"]
# Turn both sentinel codes into real NaN in a single pass over the numerical columns.
na_sentinels = [-99, 999]
df_replaced[numerical_cols] = df_replaced[numerical_cols].replace(na_sentinels, np.nan)
print("Proportion of NA vals in feature:")
na_share = df_replaced[numerical_cols].isna().mean()
print(na_share)

Remove outliers

In [None]:
# Outlier removal: keep rows whose value lies strictly within mean +/- 3 std for
# every numerical column (except TOTHLOS), and filter the outcome series in
# OUTCOME_DICT in lockstep with the feature table.
#
# All limits are computed in a first pass over the unfiltered data, so that
# dropping rows for one column cannot shift the mean/std used for the next.
df_clean = df_replaced.copy()
outlier_dict = {}
for col in numerical_cols:
    if col == "TOTHLOS":  # Not included in ML--> no need to remove outliers
        continue
    col_mean = np.mean(df_clean[col])
    col_std = np.std(df_clean[col])  # np.std defaults to ddof=0 (population std)
    outlier_dict[col] = {
        "low_lim": col_mean - (3 * col_std),
        "high_lim": col_mean + (3 * col_std),
    }


def _positive_summary(outcome):
    """Return "<count> (<pct>%)" for the entries of ``outcome`` equal to 1.

    Counting with a comparison rather than ``value_counts().loc[1]`` means an
    outcome with no positives left reports 0 instead of raising ``KeyError``;
    an empty outcome reports 0.0%.
    """
    n_pos = int((outcome == 1).sum())
    pct = (n_pos / len(outcome)) * 100 if len(outcome) else 0.0
    return f"{n_pos} ({pct:.1f}%)"


# The outcomes are filtered by row label below, so they must still be aligned
# with the feature table. This fails fast if the cell is re-run after the
# outcomes were already filtered and re-indexed by a previous run.
for outcome_name, outcome in OUTCOME_DICT.items():
    assert outcome.index.equals(df_clean.index), (
        f"{outcome_name} is not aligned with the feature table; "
        "re-run the target-variable cell before removing outliers"
    )

print(f"Original Shape: {df_clean.shape[0]}")
for col_name, lim_dict in outlier_dict.items():
    og_len = df_clean.shape[0]
    print(f"{'-'*10}{col_name} {'-'*10}")
    # Use THIS column's limits. Previously the lower bound came from a variable
    # left over from the limit-computation loop (i.e. the last column's value).
    lower_lim = lim_dict["low_lim"]
    upper_lim = lim_dict["high_lim"]
    # NOTE(review): NaN fails both comparisons, so rows with a missing value in
    # this column are dropped together with the outliers -- confirm this is intended.
    in_range = (df_clean[col_name] > lower_lim) & (df_clean[col_name] < upper_lim)
    keep_indices = df_clean[in_range].index
    df_clean = df_clean.loc[keep_indices]
    print(f"Total number removed: {og_len - df_clean.shape[0]}")
    for outcome_name, og_outcome in OUTCOME_DICT.items():
        sub_outcome = og_outcome.loc[keep_indices]
        # Re-assigning an existing key does not change the dict's size, so it is
        # safe to do while iterating over items().
        OUTCOME_DICT[outcome_name] = sub_outcome
        print(f"Outcome: {outcome_name}")
        # "Original" means before this column's filter, not before the whole loop.
        print(f"Original pos counts: {_positive_summary(og_outcome)}")
        print(f"Pos remaining: {_positive_summary(sub_outcome)}")
print(f"Resulting Shape: {df_clean.shape[0]}")
print("*" * 100)

##### Reset all indices so features and outcomes share a fresh 0..n-1 index
df_clean = df_clean.reset_index(drop=True)
for outcome_name, outcome in OUTCOME_DICT.items():
    OUTCOME_DICT[outcome_name] = outcome.reset_index(drop=True)

## Export

Outcomes

In [None]:
# Combine the outcome series into one table (one column per outcome name) and
# write it to the processed-data folder without the row index.
outcome_path = BASE_PATH / "data" / "processed" / "Outcome_df.xlsx"
outcome_df = pd.DataFrame(OUTCOME_DICT)
outcome_df.to_excel(outcome_path, index=False)

X_df

In [None]:
# Write the cleaned feature table to the processed-data folder; the row index is
# written as the first column.
features_path = BASE_PATH / "data" / "processed" / "fully_cleaned_tongue_data.xlsx"
df_clean.to_excel(features_path, index=True)