# Data Cleaning

Merge the raw DataFrames, subset the columns, and create new derived columns.

## Set Up

In [None]:
import sys
import os

# Add the parent directory to the import path so that the `src` package below
# can be imported. NOTE(review): this resolves "../" against the current
# working directory, so it presumably expects the notebook to be run from its
# own folder, one level below the project root -- confirm.
sys.path.append(os.path.abspath("../"))
from src.config import BASE_PATH
from src.data_utils import import_raw_data_dict, get_feature_lists
from src.clean_filter import merge_dfs
import pandas as pd
import numpy as np
from shutil import rmtree
import warnings

### Merge DFs

In [None]:
# Location of the raw input files, relative to the project base path.
raw_dir = BASE_PATH / "data" / "raw"
data_path = raw_dir / "cpt_filtered"
# Load the raw data from that folder, then merge it. `merge_dfs` returns two
# objects; judging by the names they are the merged frame without and with
# code columns -- TODO confirm against src.clean_filter.
data_dict = import_raw_data_dict(data_path)
merged_pair = merge_dfs(data_dict)
combined_df_no_codes, combined_df_w_codes = merged_pair

## Clean Resulting categorical instances

In [None]:
# Nested mapping passed to `DataFrame.replace`: {column name: {raw value: normalized value}}.
# Values not listed for a column are left untouched by the replacement.
# Keys such as "nan", "None" and "NULL" are string spellings of missing data;
# the "nan" ones are produced when object columns are cast to str before the
# replacement is applied.
replace_dict = {
    "SEX": {"non-bi": "non-binary"},
    "RACE_NEW": {
        "Unknown/Not Reported": "otherUnknown",
        "Unknown": "otherUnknown",
        "Some Other Race": "otherUnknown",
        "Native Hawaiian or Other Pacific Islander": "otherUnknown",
        "American Indian or Alaska": "otherUnknown",
        "Native Hawaiian or Pacifi": "otherUnknown",
        "American Indian or Alaska Native": "otherUnknown",
        "White,Black or African American": "otherUnknown",
        "Native Hawaiian or Other Pacific Islander,Asian": "otherUnknown",
        "Asian,Some Other Race": "otherUnknown",
        "Native Hawaiian or Pacific Islander": "otherUnknown",
        "American Indian or Alaskan Native": "otherUnknown",
        "White,Asian": "otherUnknown",
        "White,Native Hawaiian or Other Pacific Islander": "otherUnknown",
        "Hispanic or Latino": "otherUnknown",
        "Middle Eastern or North African": "otherUnknown",
        "NULL": "otherUnknown",
        "nan": "otherUnknown",
    },
    "ETHNICITY_HISPANIC": {
        "N": "noUnknown",
        "No": "noUnknown",
        "U": "noUnknown",
        "Unknown": "noUnknown",
        "Unk": "noUnknown",
        "Y": "Yes",
        "NULL": "noUnknown",
        "nan": "noUnknown",
    },
    "INOUT": {"Inpatient": "Yes", "Outpatient": "No"},
    "ANESTHES": {
        "None": "otherUnknown",
        "Other": "otherUnknown",
        "Monitored Anesthesia Care": "otherUnknown",
        "MAC/IV Sedation": "otherUnknown",
        "Unknown": "otherUnknown",
        "Spinal": "otherUnknown",
        "Local": "otherUnknown",
    },
    # These values were previously misspelled "otherUnkown"; they now use the
    # same "otherUnknown" label as every other column. Anything downstream
    # that matched the misspelled label must be updated accordingly.
    "SURGSPEC": {
        "Urology": "otherUnknown",
        "Vascular": "otherUnknown",
        "Cardiac Surgery": "otherUnknown",
        "Neurosurgery": "otherUnknown",
        "Orthopedics": "otherUnknown",
    },
    "DISCHDEST": {
        "Home": "HomePermRes",
        "Home/Permanent Residence": "HomePermRes",
        "Unknown": "otherUnknown",
        "Facility Which was Home": "HomePermRes",
        "Separate Acute Care": "Acute Care",
        "Acute Care Hospital": "Acute Care",
        "Unskilled Facility Not Home": "otherUnknown",
        "Hospice": "otherUnknown",
        "Other Facility": "otherUnknown",
        "Against Medical Advice (AMA)": "otherUnknown",
        "None": "otherUnknown",
        "NULL": "otherUnknown",
        "nan": "otherUnknown",
    },
    "URGENCY": {
        "Yes": "Urgent_Emergent",
        "No": "Elective",
        "Urgent": "Urgent_Emergent",
        "Emergent": "Urgent_Emergent",
        "Unk": "Unknown",
    },
    "DIABETES": {
        "NO": "No",
        "NON-INSULIN": "Yes",
        "INSULIN": "Yes",
        "ORAL": "Yes",
    },
    "DYSPNEA": {
        "Unknown": "otherUnknown",
        "MODERATE EXERTION": "Yes",
        "AT REST": "Yes",
        "None": "No",
    },
    "FNSTATUS2": {
        "Unknown": "otherUnknown",
        "Partially Dependent": "Dependent",
        "Partially D": "Dependent",
        "Totally Dependent": "Dependent",
        "Totally Dep": "Dependent",
    },
    "DIALYSIS": {"Ye": "Yes"},
    # The 0/1 keys here are integers, unlike the string "0"/"1" keys under DEHIS.
    "WNDINF": {0: "No", 1: "Yes", "None": "No"},
    "WTLOSS": {"Unknown": "otherUnknown", "None": "No"},
    "PRSEPIS": {"SIRS": "Yes", "Sepsis": "Yes", "None": "No", "nan": "No"},
    "SUPINFEC": {"No Complication": "No", "Superficial Incisional SSI": "Yes"},
    "ORGSPCSSI": {"No Complication": "No", "Organ/Space SSI": "Yes"},
    "DEHIS": {
        "No Complication": "No",
        "Wound Disruption": "Yes",
        "0": "No",
        "1": "Yes",
    },
    "OUPNEUMO": {"No Complication": "No", "Pneumonia": "Yes"},
    "REINTUB": {
        "No Complication": "No",
        "Unplanned Intub": "Yes",
        "Unplanned Intubation": "Yes",
    },
    "PULEMBOL": {"No Complication": "No", "Pulmonary Embolism": "Yes"},
    "FAILWEAN": {"No Complication": "No", "On Ventilator greater than 48 Hours": "Yes"},
    "RENAINSF": {
        "No Complication": "No",
        "Postop  Renal Insufficiency": "Yes",
        "Progressive Renal Insufficiency": "Yes",
        "Progressive Ren": "Yes",
        "None": "No",
        # NOTE(review): the "†" characters look like mis-decoded spaces in the
        # raw export; the key is kept verbatim because it has to match the
        # data exactly -- confirm against the raw files.
        "Postop† Renal†Insufficiency": "Yes",
    },
    "RENAFAIL": {
        "None": "No",
    },
    "OPRENAFL": {
        "No Complication": "No",
        "Acute Renal Failure": "Yes",
        "Postop Dialysis": "Yes",
    },
    "URNINFEC": {"No Complication": "No", "Urinary Tract Infection": "Yes"},
    "CNSCVA": {"No Complication": "No", "Stroke/CVA": "Yes"},
    "CDARREST": {
        "No Complication": "No",
        "Cardiac Arrest Requiring CPR": "Yes",
        "Cardiac Arrest": "Yes",
    },
    "CDMI": {
        "No Complication": "No",
        "Myocardial Infarction": "Yes",
        "Myocardial Infa": "Yes",
    },
    "OTHBLEED": {
        "No Complication": "No",
        "Transfusions/Intraop/Postop": "Yes",
        "Blood Transfusion": "Yes",
    },
    "OTHDVT": {
        "No Complication": "No",
        "DVT Requiring Therapy": "Yes",
        "Venous Thrombosis Requiring Therapy": "Yes",
        "DVT Requiring Therap": "Yes",
    },
    "OTHSYSEP": {"No Complication": "No", "Sepsis": "Yes"},
    "OTHSESHOCK": {"No Complication": "No", "Septic Shock": "Yes"},
    "RETURNOR": {"NUL": "No"},
    "AGE": {"90+": "90", " ": "-99"},
    "ASACLAS": {
        "5-Moribund": "4/5-Life Threat/Moribund",
        "4-Life Threat": "4/5-Life Threat/Moribund",
        "None assigned": "unknown",
    },
    "YRDEATH": {"-99": "No", "-99.0": "No"},
    "UNPLREOP": {"nan": "No"},
    "READ": {"None": "No", "nan": "No"},
    "UNPLREAD": {"None": "No", "nan": "No"},
}

# Work on a copy so the merged frame from the earlier cell is left untouched.
df_combined = combined_df_no_codes.copy()
# Upper-case every column label so later lookups use a single spelling.
df_combined.columns = df_combined.columns.str.upper()
# Cast object columns to str and trim leading/trailing whitespace. The cast
# also turns missing entries into the literal string "nan", which is what the
# "nan" keys in `replace_dict` match.
string_cols = df_combined.select_dtypes(include=["object"]).columns
for text_col in string_cols:
    as_text = df_combined[text_col].astype(str)
    df_combined[text_col] = as_text.str.strip()
# Normalize raw category values column by column, silencing only the pandas
# downcasting FutureWarning for the duration of the call.
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore", category=FutureWarning, message=".*Downcasting behavior.*"
    )
    normalized = df_combined.replace(replace_dict)
    df_replaced = normalized.infer_objects(copy=False).copy()
# Mortality is "Yes" when YRDEATH holds anything other than "No", or when the
# discharge destination is "Expired"; otherwise it is "No". YRDEATH is
# dropped once it has been folded into the flag.
death_recorded = df_replaced["YRDEATH"] != "No"
expired_at_discharge = df_replaced["DISCHDEST"] == "Expired"
df_replaced["MORTALITY"] = np.where(death_recorded | expired_at_discharge, "Yes", "No")
df_replaced = df_replaced.drop(columns=["YRDEATH"])
# Drop rows belonging to the sparse categories.
keep_rows = (df_replaced["SEX"] != "non-binary") & (df_replaced["ASACLAS"] != "unknown")
df_replaced = df_replaced[keep_rows]

In [None]:
# Cast the operation year to int, going through float first.
operyr_numeric = df_replaced["OPERYR"].astype(float)
df_replaced["OPERYR"] = operyr_numeric.astype(int)
# Show which RENAINSF values occur in 2021.
df_replaced.loc[df_replaced["OPERYR"] == 2021, "RENAINSF"].unique()

Relabel "No" as a year-tagged "Unknown(...)" value for variables that were not recorded in a given operation year

In [None]:
df_unknowns = df_replaced.copy()
df_unknowns["OPERYR"] = df_unknowns["OPERYR"].astype(float).astype(int)

# One entry per variable that was not recorded during some operation years:
# (column, first year, last year, replacement label). Year bounds are inclusive.
not_recorded_spans = [
    ("READ", 2008, 2010, "Unknown(08-10)"),
    ("UNPLREAD", 2008, 2011, "Unknown(08-11)"),
    ("RENAINSF", 2021, 2021, "Unknown(21)"),
    ("RENAFAIL", 2021, 2021, "Unknown(21)"),
    ("WTLOSS", 2021, 2024, "Unknown(21-24)"),
    ("WNDINF", 2021, 2024, "Unknown(21-24)"),
    ("DYSPNEA", 2021, 2024, "Unknown(21-24)"),
]
for column, first_year, last_year, label in not_recorded_spans:
    # Only "No" entries inside the unrecorded window are relabelled; every
    # other value in the column is left as it is.
    in_gap = df_unknowns["OPERYR"].between(first_year, last_year)
    was_no = df_unknowns[column] == "No"
    df_unknowns.loc[was_no & in_gap, column] = label

In [None]:
# Sanity check: after normalization, no column outside the numerical feature
# list may contain a missing value.
numerical_cols = get_feature_lists(df_unknowns)["numerical_cols"]
numeric_lookup = set(numerical_cols)
other_cols = [name for name in df_unknowns.columns if name not in numeric_lookup]
missing_count = df_unknowns[other_cols].isna().to_numpy().sum()
assert missing_count == 0

## Clean Numerical Features

Explore NA

In [None]:
## Missing numeric values arrive as the sentinels -99 / 999 rather than NA.
## NOTE(review): every 999 is treated as missing, including in columns where
## 999 could be a real measurement -- confirm this is intended.
numerical_cols = get_feature_lists(df_unknowns)["numerical_cols"]
numeric_block = df_unknowns[numerical_cols].astype(float)
df_unknowns[numerical_cols] = numeric_block.replace([-99, 999], np.nan)
## Report the share of missing values per numerical feature.
na_share_pct = df_unknowns[numerical_cols].isna().mean() * 100
print("Proportion (%) of NA vals in feature:")
print(round(na_share_pct, 2))

Remove outliers (rows where `OPTIME` is 0)

In [None]:
## Summary statistics of the numerical features, for visual inspection.
numeric_summary = df_unknowns[numerical_cols].describe()
display(numeric_summary)
## Report, then drop, rows whose OPTIME is exactly 0 (rows with a missing
## OPTIME are kept).
zero_optime = df_unknowns["OPTIME"] == 0
print(f"Total removed due to 0 OPTIME: {zero_optime.sum()}")
df_unknowns = df_unknowns[~zero_optime]

## Create Target Variables

In [None]:
### Reset indices so the cleaned frame and every outcome series share a
### fresh 0..n-1 index after the row filtering done earlier.
df_clean = df_unknowns.reset_index(drop=True)

### Encoding used for the single-column outcomes. Any value other than
### "No"/"Yes" maps to NaN, which makes the int cast below raise instead of
### silently mislabelling rows.
yes_no_to_int = {"No": 0, "Yes": 1}

### =================================> Surgical Wound Complications###
surg_wound_compl_cols = [
    "SUPINFEC",  # Superficial infection
    "WNDINFD",  # Deep infection
    "ORGSPCSSI",  # Organ space infection
    "DEHIS",  # Dehiscence
]
# Positive (1) when any of the component columns equals "Yes".
y_surg_wnd_comp = (df_clean[surg_wound_compl_cols] == "Yes").any(axis=1).astype(int)
y_surg_wnd_comp.name = "Surgical_Outcome"
### =================================> Bleed
y_bleed = df_clean["OTHBLEED"].map(yes_no_to_int).astype(int)
y_bleed.name = "Bleed_Outcome"
### =================================> Aspiration Complications###
asp_compl_cols = [
    "OUPNEUMO",  # Pneumonia
    "REINTUB",  # Reintubation
    "FAILWEAN",  # Ventilator > 48hrs
]
# Positive (1) when any of the component columns equals "Yes".
y_asp_comp = (df_clean[asp_compl_cols] == "Yes").any(axis=1).astype(int)
y_asp_comp.name = "Aspiration_Outcome"
### =================================> Mortality
y_mortality = df_clean["MORTALITY"].map(yes_no_to_int).astype(int)
y_mortality.name = "Mortality_Outcome"
### =================================> UnplannedReOp
y_reop = df_clean["UNPLREOP"].map(yes_no_to_int).astype(int)
y_reop.name = "ReOp_Outcome"
## Dict of outcome name and outcome data
OUTCOME_DICT = {
    "Surgical_Outcome": y_surg_wnd_comp,
    "Bleed_Outcome": y_bleed,
    "Aspiration_Outcome": y_asp_comp,
    "Mortality_Outcome": y_mortality,
    "ReOp_Outcome": y_reop,
}
## Dict of outcome name and sub-column names.
## The mortality entry now names the actual upper-case "MORTALITY" column;
## the previous "Mortality" spelling matched no column in df_clean, whose
## labels are all upper case.
outcome_sub_cols = {
    "Surgical_Outcome": surg_wound_compl_cols,
    "Bleed_Outcome": "OTHBLEED",
    "Aspiration_Outcome": asp_compl_cols,
    "Mortality_Outcome": ["MORTALITY", "DISCHDEST"],
    "ReOp_Outcome": "UNPLREOP",
}

## Export

Outcomes

In [None]:
## Both exports go to the same processed-data folder.
processed_dir = BASE_PATH / "data" / "processed"

## Outcomes
outcome_df = pd.DataFrame(OUTCOME_DICT)
outcome_path = processed_dir / "Outcome_df.xlsx"

## X_df
df_path = processed_dir / "fully_cleaned_tongue_data.xlsx"

## Write each frame in turn: delete any previous file (with a warning),
## make sure the target folder exists, then export with the index included.
export_jobs = (
    (outcome_df, outcome_path, "outcome"),
    (df_clean, df_path, "tabular"),
)
for frame, target, label in export_jobs:
    if target.exists():
        target.unlink()
        warnings.warn(f"Over-writing {label} data at {target}")
    target.parent.mkdir(exist_ok=True, parents=True)
    frame.to_excel(target, index=True)