# Extract/Filter on ICD/CPT codes

## Set Up

Import packages and project helpers, then load the raw data.

In [None]:
import os
import sys

sys.path.append(os.path.abspath("../"))
from src.config import BASE_PATH
from src.data_utils import import_raw_data_dict
import pandas as pd
from src.clean_filter import create_and_filter_new_cols

## Path to the directory where the raw data files are saved
IMPORT_DIR = BASE_PATH / "data" / "raw" / "fully_raw"
## Import dict w/ format {year:data_file}; the keys look like "NSQIP_24"
## (that key is read in the 2024 cleaning cell). Presumably each value is a
## pandas DataFrame -- TODO confirm against src.data_utils.import_raw_data_dict
raw_data_dict = import_raw_data_dict(IMPORT_DIR)

# Clean up before filtering

## 2024 cleaning

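Rename the 2024 columns from their long descriptive headers to the short variable names used in the rest of this notebook.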

In [None]:
## Work on a copy of the 2024 table; raw_data_dict["NSQIP_24"] itself is only
## replaced once the column subset + rename further down succeeds
df_24 = raw_data_dict["NSQIP_24"].copy()
## Long-form 2024 column header -> short variable name.
## Insertion order matters: the keys are also used to select (and therefore
## order) the 2024 columns right after this dict is defined.
rename_dict = {
    ####### DEMOGRAPHICS / PRE-OP #######
    "Case.Identification.Number": "CaseID",
    "Age.of.patient.with.patients.over.89.coded.as.90.": "AGE",
    "Height.in.inches": "HEIGHT",
    "Weight.in.lbs": "WEIGHT",
    "Gender": "SEX",
    "New.Race": "RACE_NEW",
    "Ethnicity.Hispanic": "ETHNICITY_HISPANIC",
    "Diabetes.mellitus.with.oral.agents.or.insulin": "DIABETES",
    "History.of.severe.COPD": "HXCOPD",
    "Heart.Failure.in.30.days.before.surgery": "HXCHF",
    "Ascites": "ASCITES",
    "Bleeding.disorders": "BLEEDDIS",
    "Transfusion...1.units.PRBCs.in.72.hours..before.surgery": "TRANSFUS",
    "Preop.Dialysis": "DIALYSIS",
    "Preop.Acute.Kidney.Injury": "RENAFAIL",
    "Systemic.sepsis": "PRSEPIS",
    "Pre.operative.serum.albumin": "PRALBUM",
    "Pre.operative.WBC": "PRWBC",
    "Hypertension.requiring.medication": "HYPERMED",
    "Ventilator.dependent": "VENTILAT",
    "Current.smoker.within.one.year": "SMOKE",
    "Disseminated.cancer": "DISCANCR",
    "Immunosuppressive.Therapy": "STEROID",
    "Functional.health.status.Prior.to.Surgery": "FNSTATUS2",
    "ASA.classification": "ASACLAS",
    "Total.operation.time": "OPTIME",
    ####### ICD #######
    "Post.op.diagnosis..ICD.10.": "PODIAG10",
    "Other.postoperative.occurrence.ICD.10.": "PODIAG_OTHER10",
    ###### CPT ########
    "CPT": "CPT",
    # Other.CPT.Code.1..10 -> OTHERCPT1..10
    **{f"Other.CPT.Code.{n}": f"OTHERCPT{n}" for n in range(1, 11)},
    # Concurrent.CPT.1..10 -> CONCPT1..10
    **{f"Concurrent.CPT.{n}": f"CONCPT{n}" for n in range(1, 11)},
    ####### OTHERS #########
    "Case.Acuity": "CASETYPE",
    "Surgical.Specialty": "SURGSPEC",
    "Principal.anesthesia.technique": "ANESTHES",
    "Inpatient.outpatient": "INOUT",
    "Year.of.Operation": "OperYr",
    "Length.of.total.hospital.stay": "TOTHLOS",
    "Year.of.death": "YRDEATH",
    "Return.to.OR": "RETURNOR",
    # Unplanned.Reoperation.1..2 -> REOPERATION1..2, then the ">2" column
    **{f"Unplanned.Reoperation.{n}": f"REOPERATION{n}" for n in (1, 2)},
    "More.than.2.unplanned.reoperations": "REOPERATION3",
    # Any.Readmission.1..5 -> READMISSION1..5
    **{f"Any.Readmission.{n}": f"READMISSION{n}" for n in range(1, 6)},
    # Unplanned.Readmission.1..5 -> UNPLANNEDREADMISSION1..5
    **{
        f"Unplanned.Readmission.{n}": f"UNPLANNEDREADMISSION{n}"
        for n in range(1, 6)
    },
    ####### POST-OP OCCURRENCES #######
    "Occurrences.Superficial.surgical.site.infection": "SUPINFEC",
    "Occurrences.Deep.Incisional.SSI": "WNDINFD",
    "Occurrences.Organ.Space.SSI": "ORGSPCSSI",
    "Number.of.Wound.Disruption.Occurrences": "DEHIS",
    "Occurrences.Blood.Transfusion": "OTHBLEED",
    "Occurrences.Vein.Thrombosis.Requiring.Therapy": "OTHDVT",
    "Occurrences.Pulmonary.Embolism": "PULEMBOL",
    "Occurrences.Pneumonia": "OUPNEUMO",
    "Occurrences.Unplanned.Intubation": "REINTUB",
    "Occurrences.Ventilator...48.Hours": "FAILWEAN",
    "Occurrences.Postop.Renal.Insufficiency": "RENAINSF",
    "Occurrences.Postop.Dialysis": "OPRENAFL",
    "Occurrences.UrinaryTractInfection": "URNINFEC",
    "CVA.Stroke.with.neurological.deficit": "CNSCVA",
    "Occurrences.Cardiac.Arrest.Requiring.CPR": "CDARREST",
    "Occurrences.Myocardial.Infarction": "CDMI",
    "Occurrences.Sepsis": "OTHSYSEP",
    "Occurrences.Septic.Shock": "OTHSESHOCK",
    "Discharge.Destination": "DISCHDEST",
    ## Extra Blood
    "Pre.operative.Hemoglobin.A1C": "PRHEMO_A1C",
    "Pre.operative.Hemoglobin": "PRHEMOGLOBIN",
    "Pre.operative.hematocrit": "PRHCT",
    "Pre.operative.platelet.count": "PRPLATE",
    "Pre.operative.International.Normalized.Ratio..INR..of.PT.values": "PRINR",
    "Pre.operative.PTT": "PRPTT",
}
## Subset the 2024 table to the columns in rename_dict and rename them.
## Re-running this cell after a successful rename is fine: the long-form
## headers are gone but every short name is present, so the step is skipped.
## Any other missing header (typo in rename_dict, changed export format) is a
## real problem and is raised instead of being reported as "already filtered".
missing_cols = [col for col in rename_dict if col not in df_24.columns]
if not missing_cols:
    # Column order of the subset follows rename_dict's insertion order
    df_24_sub = df_24[list(rename_dict)].copy()
    raw_data_dict["NSQIP_24"] = df_24_sub.rename(columns=rename_dict)
elif all(new_col in df_24.columns for new_col in rename_dict.values()):
    print("Already filtered, continuing")
else:
    raise KeyError(f"NSQIP_24 is missing expected columns: {missing_cols}")

## More cleaning

Set up the code-to-column dictionaries and the lists of columns to search and to keep.

In [None]:
## Maps each new ICD flag column to the (code, "prefix") pairs to search for.
## Written as column -> codes; every code is paired with the "prefix" mode.
new_icd_dict = {
    flag_col: [(icd_code, "prefix") for icd_code in icd_codes]
    for flag_col, icd_codes in {
        # Carcinoma in situ
        "CarcinomaICD": ("233", "D05"),
        # Malignant neoplasm (+ history of neoplasm breast)
        "MalignantICD": ("174", "175", "C50", "V10", "Z85"),
        # Metastatic
        "MetastaticICD": ("198.8", "C78", "C79"),
        # Prophylactic
        "ProphylacticICD": ("Z15", "Z40", "V50.4", "V84"),
        # Abnormal Breast Imaging
        "AbBreastICD": ("793.8", "R92"),
        # Benign
        "BenignICD": ("217", "D24", "610", "N60"),
        # Inflammatory/Other Breast Disorder
        "InflOtherICD": ("611", "N61", "N62", "N63", "N64"),
        # Congenital breast disorders
        "CongICD": ("757.6", "Q83"),
        # Acquired Absence
        "AbsICD": ("V45.71", "Z90.1"),
    }.items()
}
## Maps each new CPT flag column to the (code, mode) pairs to search for,
## where mode is the string "exact" or "prefix"
def _exact(*codes):
    """Pair every given CPT code with the "exact" mode."""
    return [(code, "exact") for code in codes]


# Code families named once because procAnatCPT is their concatenation
_partial_codes = _exact("19301", "19302", "19125", "19120", "19160", "19162")
_subsimple_codes = _exact("19303", "19304", "19180", "19182")
_radical_codes = _exact("19305", "19306", "19200", "19220")
_modified_radical_codes = _exact("19307", "19240")

new_cpt_dict = {
    ############ LYMPHATIC  ################
    "snlbCPT": _exact("38500", "38525"),
    "alndCPT": _exact(
        "38740",
        "38745",
        "19302",
        "19305",
        "19306",
        "19200",
        "19220",
        "19162",
        "19240",
        "19307",
    ),
    "partialCPT": _partial_codes,
    ############ MASTECTOMY ################
    "subsimpleCPT": _subsimple_codes,
    "radicalCPT": _radical_codes,
    "modifiedRadicalCPT": _modified_radical_codes,
    ### Procedural Anatomy-->Count number of total mastectomy codes
    # partial + subsimple + radical + modified radical, in that order
    "procAnatCPT": (
        _partial_codes
        + _subsimple_codes
        + _radical_codes
        + _modified_radical_codes
    ),
    "immediateCPT": _exact("19340"),
    "delayedCPT": _exact("19342"),
    "teinsertionCPT": _exact("19357"),
    "teexpanderCPT": _exact("11970"),
    "freeCPT": _exact("19364"),
    "latCPT": _exact("19361"),
    "SinTramCPT": _exact("19367"),
    "SinTramSuperCPT": _exact("19368"),
    "BiTramCPT": _exact("19369"),
    "MastoCPT": _exact("19316"),
    "BreastRedCPT": _exact("19318"),
    "FatGraftCPT": _exact("15771", "15772"),
    # The only entry in this table that uses the "prefix" mode
    "AdjTisTransCPT": [("14", "prefix")],
    "AugProsImpCPT": _exact("19325"),
    "OtherReconTechCPT": _exact("19366"),
    "RevRecBreastCPT": _exact("19380"),
    "npwtCPT": _exact("97605", "97606", "97607", "97608"),
}

## ICD columns to search in: the un-suffixed diagnosis columns first, then
## their "10"-suffixed counterparts
TARGET_ICD_COLS = ["PODIAG", "PODIAGTX"]
TARGET_ICD_COLS += [f"{icd_col}10" for icd_col in TARGET_ICD_COLS]
# Currently not searched: "PODIAG_OTHER", "PODIAG_OTHER10"
## CPT columns to search in: the two primary columns, then the numbered
## OTHERCPT1..10 and CONCPT1..10 families
TARGET_CPT_COLS = [
    "PRNCPTX",
    "CPT",
    *(f"OTHERCPT{n}" for n in range(1, 11)),
    *(f"CONCPT{n}" for n in range(1, 11)),
]

## Blood cols that are kept
BLOOD_COLS = ["PRALBUM", "PRWBC", "PRHCT", "PRPLATE"]
# Currently left out: "PRHEMO_A1C", "PRHEMOGLOBIN", "HEMO", "PRINR", "PRPTT"

## Non-code data columns to keep, grouped by theme
TARGET_DATA_COLS = [
    ## Demographics
    *("AGE", "HEIGHT", "WEIGHT", "SEX", "RACE_NEW", "ETHNICITY_HISPANIC"),
    ## Pre-op health + comorbidities
    # ("PRWBC" / "PRALBUM" are not listed here; they are part of BLOOD_COLS)
    *("DIABETES", "RENAFAIL", "HXCOPD", "WNDINF", "DYSPNEA"),
    *("HXCHF", "ASCITES", "WTLOSS", "FNSTATUS2"),
    # Can be either or for this one
    *("BLEEDDIS", "BLEEDIS"),
    "PRSEPIS",  # Pre-op sepsis
    *("TRANSFUS", "DIALYSIS", "HYPERMED", "VENTILAT"),
    *("SMOKE", "DISCANCR", "STEROID", "ASACLAS"),
    ## Surgical Characteristics
    "OPTIME",
    # Place all ICD cols here
    # Place CPT cols here
    ### Urgency ###
    "EMERGNCY",  # 08-20
    "CASETYPE",  # 21-23
    *("SURGSPEC", "ANESTHES", "INOUT", "OperYR"),
    ### Peri- and post-op
    *("TOTHLOS", "YRDEATH"),
    ### Unplanned ReOp: RETURNOR, then REOPERATION and REOPERATION1..3
    "RETURNOR",
    *(f"REOPERATION{n}" for n in ("", 1, 2, 3)),
    ### ReAd (08-10 not reported): READMISSION and READMISSION1..5
    *(f"READMISSION{n}" for n in ("", 1, 2, 3, 4, 5)),
    ### Unplanned ReAd (08-10 not reported); the un-numbered name is in none
    ### of them
    *(f"UNPLANNEDREADMISSION{n}" for n in ("", 1, 2, 3, 4, 5)),
    #### Surgical Complications (make summary variable too)
    *("SUPINFEC", "WNDINFD", "ORGSPCSSI", "DEHIS", "OTHBLEED"),
    #### Medical Complications (make summary variable too)
    *("OTHDVT", "PULEMBOL", "OUPNEUMO", "REINTUB", "FAILWEAN"),
    *("URNINFEC", "CNSCVA", "CDARREST", "CDMI"),
    *("OTHSYSEP", "OTHSESHOCK", "OPRENAFL", "RENAINSF"),
    "DISCHDEST",
]
## Blood columns are part of the data columns (extended in place)
TARGET_DATA_COLS.extend(BLOOD_COLS)
## All columns we want in the df, upper-cased to normalize: the ICD and CPT
## search columns, the data columns, then the new ICD/CPT flag columns
TARGET_COLS = [
    wanted_col.upper()
    for wanted_col in (
        *TARGET_ICD_COLS,
        *TARGET_CPT_COLS,
        *TARGET_DATA_COLS,
        *new_icd_dict,
        *new_cpt_dict,
    )
]

Extract/Filter on ICD

In [None]:
## Export location for the ICD-filtered data
ICD_DIR = BASE_PATH / "data" / "raw" / "icd_filtered"

## Base file
# Filter on all of the new ICD columns; the result is the input of the CPT step
icd_df_dict_base = create_and_filter_new_cols(
    old_df_dict=raw_data_dict,
    new_col_dict=new_icd_dict,
    target_code_cols=TARGET_ICD_COLS,
    target_cols=TARGET_COLS,
    filter_cols=list(new_icd_dict),
    export_dir=ICD_DIR,
    cpt_flag=False,
    extra_filtered=False,
)

Extract/Filter on CPT

In [None]:
## Export location for the CPT-filtered files
CPT_DIR = BASE_PATH / "data" / "raw" / "cpt_filtered"

## Base file
# Filter on every new CPT column except the lymph cols, NPWT and procAnatCPT
new_cpt_cols = list(new_cpt_dict)
base_cols_remove = ["snlbCPT", "alndCPT", "npwtCPT", "procAnatCPT"]
base_filter_cols = [
    cpt_col for cpt_col in new_cpt_cols if cpt_col not in base_cols_remove
]
# Create + filter, starting from the ICD-filtered data
cpt_df_dict_base = create_and_filter_new_cols(
    old_df_dict=icd_df_dict_base,
    new_col_dict=new_cpt_dict,
    target_code_cols=TARGET_CPT_COLS,
    target_cols=TARGET_COLS,
    filter_cols=base_filter_cols,
    export_dir=CPT_DIR,
    cpt_flag=True,
    extra_filtered=False,
)