# Extract/Filter on ICD/CPT Codes

## Set Up

In [None]:
import os
import sys

sys.path.append(os.path.abspath("../"))
from src.config import BASE_PATH
from src.data_utils import import_raw_data_dict
import pandas as pd
from src.clean_filter import create_and_filter_new_cols

## Directory the raw data is read from: BASE_PATH/data/raw/fully_raw
IMPORT_DIR = BASE_PATH / "data" / "raw" / "fully_raw"
## Import the raw data as a dict with format {year: data_file}.
## Later cells index it with keys such as "NSQIP_24".
raw_data_dict = import_raw_data_dict(IMPORT_DIR)

## Clean Up before filtering

### 2024 processing
- The 2024 file only needs its columns renamed to the short variable names used in the rest of this notebook.

In [None]:
# Take a copy so the frame stored in raw_data_dict is not modified here.
df_24 = raw_data_dict["NSQIP_24"].copy()

# 2024 header -> short column name. Insertion order matters: it is the column
# order of the subset selected from df_24 right after this dict.
rename_dict = {
    # ---- Identifier and demographics ----
    "Case.Identification.Number": "CaseID",
    "Age.of.patient.with.patients.over.89.coded.as.90.": "AGE",
    "Height.in.inches": "HEIGHT",
    "Weight.in.lbs": "WEIGHT",
    "Gender": "SEX",
    "New.Race": "RACE_NEW",
    "Ethnicity.Hispanic": "ETHNICITY_HISPANIC",
    # ---- Pre-op health and comorbidities ----
    "Diabetes.mellitus.with.oral.agents.or.insulin": "DIABETES",
    "History.of.severe.COPD": "HXCOPD",
    "Heart.Failure.in.30.days.before.surgery": "HXCHF",
    "Ascites": "ASCITES",
    "Bleeding.disorders": "BLEEDDIS",
    "Transfusion...1.units.PRBCs.in.72.hours..before.surgery": "TRANSFUS",
    "Preop.Dialysis": "DIALYSIS",
    "Preop.Acute.Kidney.Injury": "RENAFAIL",
    "Systemic.sepsis": "PRSEPIS",
    "Pre.operative.serum.albumin": "PRALBUM",
    "Pre.operative.WBC": "PRWBC",
    "Hypertension.requiring.medication": "HYPERMED",
    "Ventilator.dependent": "VENTILAT",
    "Current.smoker.within.one.year": "SMOKE",
    "Disseminated.cancer": "DISCANCR",
    "Immunosuppressive.Therapy": "STEROID",
    "Functional.health.status.Prior.to.Surgery": "FNSTATUS2",
    "ASA.classification": "ASACLAS",
    "Total.operation.time": "OPTIME",
    # ---- ICD ----
    "Post.op.diagnosis..ICD.10.": "PODIAG10",
    "Other.postoperative.occurrence.ICD.10.": "PODIAG_OTHER10",
    # ---- CPT: principal code, then other/concurrent slots 1..10 ----
    "CPT": "CPT",
    **{f"Other.CPT.Code.{n}": f"OTHERCPT{n}" for n in range(1, 11)},
    **{f"Concurrent.CPT.{n}": f"CONCPT{n}" for n in range(1, 11)},
    # ---- Case descriptors and outcomes ----
    "Case.Acuity": "CASETYPE",
    "Surgical.Specialty": "SURGSPEC",
    "Principal.anesthesia.technique": "ANESTHES",
    "Inpatient.outpatient": "INOUT",
    "Year.of.Operation": "OperYr",
    "Length.of.total.hospital.stay": "TOTHLOS",
    "Year.of.death": "YRDEATH",
    "Return.to.OR": "RETURNOR",
    "Unplanned.Reoperation.1": "REOPERATION1",
    "Unplanned.Reoperation.2": "REOPERATION2",
    "More.than.2.unplanned.reoperations": "REOPERATION3",
    # Readmission slots 1..5 (any, then unplanned)
    **{f"Any.Readmission.{n}": f"READMISSION{n}" for n in range(1, 6)},
    **{
        f"Unplanned.Readmission.{n}": f"UNPLANNEDREADMISSION{n}"
        for n in range(1, 6)
    },
    # ---- Post-op occurrences ----
    "Occurrences.Superficial.surgical.site.infection": "SUPINFEC",
    "Occurrences.Deep.Incisional.SSI": "WNDINFD",
    "Occurrences.Organ.Space.SSI": "ORGSPCSSI",
    "Number.of.Wound.Disruption.Occurrences": "DEHIS",
    "Occurrences.Blood.Transfusion": "OTHBLEED",
    "Occurrences.Vein.Thrombosis.Requiring.Therapy": "OTHDVT",
    "Occurrences.Pulmonary.Embolism": "PULEMBOL",
    "Occurrences.Pneumonia": "OUPNEUMO",
    "Occurrences.Unplanned.Intubation": "REINTUB",
    "Occurrences.Ventilator...48.Hours": "FAILWEAN",
    "Occurrences.Postop.Renal.Insufficiency": "RENAINSF",
    "Occurrences.Postop.Dialysis": "OPRENAFL",
    "Occurrences.UrinaryTractInfection": "URNINFEC",
    "CVA.Stroke.with.neurological.deficit": "CNSCVA",
    "Occurrences.Cardiac.Arrest.Requiring.CPR": "CDARREST",
    "Occurrences.Myocardial.Infarction": "CDMI",
    "Occurrences.Sepsis": "OTHSYSEP",
    "Occurrences.Septic.Shock": "OTHSESHOCK",
    "Discharge.Destination": "DISCHDEST",
    # ---- Additional pre-op blood labs ----
    "Pre.operative.Hemoglobin.A1C": "PRHEMO_A1C",
    "Pre.operative.Hemoglobin": "PRHEMOGLOBIN",
    "Pre.operative.hematocrit": "PRHCT",
    "Pre.operative.platelet.count": "PRPLATE",
    "Pre.operative.International.Normalized.Ratio..INR..of.PT.values": "PRINR",
    "Pre.operative.PTT": "PRPTT",
}
# Subset the 2024 frame to the mapped columns and rename them, exactly once.
# Re-running this cell after the rename has been applied must be a no-op, but
# a source column that is genuinely missing must not be reported as
# "Already filtered" -- so the two situations are told apart explicitly.
# In both non-rename cases raw_data_dict["NSQIP_24"] is left untouched.
_present_cols = set(df_24.columns)
missing_cols = [col for col in rename_dict if col not in _present_cols]
if not missing_cols:
    # list(...) gives pandas a plain list of labels, in rename_dict order.
    df_24_sub = df_24[list(rename_dict)].copy()
    raw_data_dict["NSQIP_24"] = df_24_sub.rename(columns=rename_dict)
elif {new.upper() for new in rename_dict.values()} <= {
    str(col).upper() for col in _present_cols
}:
    # Every target name already exists (compared case-insensitively in case a
    # later step normalised the header case): the rename ran previously.
    print("Already filtered, continuing")
else:
    # Neither the original nor the renamed layout: surface what is missing
    # instead of silently claiming the data was already processed.
    print(
        "WARNING: NSQIP_24 left unchanged; expected columns not found: "
        f"{missing_cols}"
    )

### More cleaning

Set up the column lists and the ICD/CPT code dictionaries

In [None]:
## New ICD column name -> list of (code, match_mode) pairs to search for.
## match_mode is "prefix" or "exact"; how each mode is applied is decided by
## create_and_filter_new_cols (presumably starts-with vs. full match -- confirm).
new_icd_dict = {
    "Malignant neoplasm of base of tongue": [
        (code, "prefix") for code in ("C01", "141.0")
    ],
    "Malignant neoplasm of surface of tongue": [
        (code, "prefix")
        # first two codes: dorsal surface; last two: ventral surface
        for code in ("C02.0", "141.1", "C02.2", "141.3")
    ],
    "Malignant neoplasm of border of tongue": [
        (code, "prefix") for code in ("C02.1", "141.2")
    ],
    "Malignant neoplasm of anterior two-thirds of tongue unspecified": [
        (code, "prefix") for code in ("C02.3", "141.4")
    ],
    "Malignant neoplasm of junctional zone of tongue": [
        (code, "prefix") for code in ("C02.8", "141.5")
    ],
    # Combined
    "Malignant neoplasm of lingual tonsil": [
        (code, "prefix") for code in ("C02.4", "141.6")
    ],
    "Malignant neoplasm of tongue unspecified": [
        # The bare category codes (with or without a trailing dot) are
        # matched exactly; the .8/.9 subcodes are matched by prefix.
        (code, "exact" if code in {"141", "C02", "C02.", "141."} else "prefix")
        for code in ("141", "141.8", "141.9", "C02", "C02.", "141.", "C02.9")
    ],
}
## New CPT column name -> list of (code, "exact") pairs to search for.
## Every CPT code uses the "exact" match mode, so the categories are written
## as plain code tuples and expanded into (code, "exact") pairs in one place.
## A code may deliberately appear under more than one category (e.g. the
## glossectomy codes 41135/41140/41145/41153/41155 are also listed under neck
## dissection), but each code is listed only once within a category.
new_cpt_dict = {
    category: [(code, "exact") for code in codes]
    for category, codes in {
        "Partial Glossectomy (Hemiglossectomy_Subtotal)": (
            "41120", "41130", "41135",
        ),
        "Composite_Extended Glossectomy": (
            "41150", "41153", "41155",
        ),
        "Total Glossectomy (Complete Tongue Removal)": (
            "41140", "41145",
        ),
        "Excision of Tongue Lesions (Minor)": (
            "41110", "41100", "41112", "41113", "41114", "41116", "41105",
        ),
        "Local_Regional Tissue Flaps for Oral Cavity Reconstruction": (
            "14301", "14021", "14302", "14040", "14020", "14041", "15733",
            "15740",
        ),
        "Free Tissue Transfer (Microvascular Free Flaps) and Complex Flap Reconstruction": (
            "15732", "15734", "15736", "15738", "15750", "20955", "15756",
            "15757", "15758", "42894", "15770", "20969", "21215", "21230",
            # NOTE(review): "20962" was listed twice here; the redundant
            # second entry was dropped. If it was a typo for another code,
            # add the intended code -- confirm with the code list owner.
            "20962", "20902",
        ),
        "Skin Autografts for Head and Neck Reconstruction": (
            "15100", "15120", "15220", "15240", "15101", "15004", "15200",
            "15241", "15121", "15275", "15221", "15271", "15272", "15273",
        ),
        "Neck Dissection and Lymphadenectomy Procedures": (
            "38700", "38720", "38724", "38510", "38500", "38542", "31365",
            "41135", "41140", "41145", "41153", "41155",
        ),
        "Alveolar Ridge and Gingival Procedures": (
            "41874", "40845", "40840",
        ),
        "Mandibular Resection and Reconstruction Procedures": (
            "21198", "21244", "21461", "21045", "21044", "21025", "21196",
            "21245", "21047",
        ),
        "Peripheral Nerve Repair and Neuroplasty": (
            "64716", "64886", "64885", "64864", "64740",
        ),
        "Tracheostomy Procedures": (
            "31600", "31610", "31611", "31603",
        ),
        "Gastrostomy and Esophageal Access Procedures": (
            "43830", "49440", "43030", "44500", "44120", "43832",
        ),
        "Submandibular Gland Excision": (
            "42440", "42420", "42450",
        ),
        "Parotid Gland Excision": (
            "42415", "42410", "42505",
        ),
        "Laryngeal Resection and Reconstruction Procedures": (
            "31360", "31367", "31599", "31365", "31395",
        ),
        "Pharyngeal Resection and Reconstruction Procedures": (
            "42890", "31395", "42808", "42892", "42894", "42950", "42953",
            "42962",
        ),
        "Tonsillectomy and Tonsillar Region Procedures": (
            "42826", "42842", "42845", "42844", "42961", "42821", "42870",
        ),
    }.items()
}

## Source columns scanned for ICD codes
TARGET_ICD_COLS = [
    "PODIAG", "PODIAGTX",
    "PODIAG10", "PODIAGTX10",
    "PODIAG_OTHER", "PODIAG_OTHER10",
]
## Source columns scanned for CPT codes: the two principal-code columns,
## followed by the ten "other" and the ten "concurrent" procedure slots
TARGET_CPT_COLS = [
    "PRNCPTX",
    "CPT",
    *[f"OTHERCPT{slot}" for slot in range(1, 11)],
    *[f"CONCPT{slot}" for slot in range(1, 11)],
]
## Pre-op lab columns to keep. Currently disabled candidates: PRHEMO_A1C,
## PRHEMOGLOBIN, HEMO, PRINR, PRPTT.
BLOOD_COLS = ["PRALBUM", "PRWBC", "PRHCT", "PRPLATE"]

## Non-code data columns to keep. Some names occur only in some of the files
## (see the per-line notes), so alternatives are listed side by side.
TARGET_DATA_COLS = [
    "CASEID",
    # -- Demographics --
    "AGE", "HEIGHT", "WEIGHT", "SEX", "RACE_NEW", "ETHNICITY_HISPANIC",
    # -- Pre-op health + comorbidities --
    # (PRWBC / PRALBUM are not repeated here; they are listed in BLOOD_COLS)
    "DIABETES", "RENAFAIL", "HXCOPD", "WNDINF", "DYSPNEA", "HXCHF",
    "ASCITES", "WTLOSS", "FNSTATUS2",
    # Either spelling can occur for this one
    "BLEEDDIS", "BLEEDIS",
    "PRSEPIS",  # Pre-op sepsis
    "TRANSFUS", "DIALYSIS", "HYPERMED", "VENTILAT", "SMOKE", "DISCANCR",
    "STEROID", "ASACLAS",
    # -- Surgical characteristics --
    # (the ICD and CPT columns are kept in their own lists)
    "OPTIME",
    # Urgency
    "EMERGNCY",  # 08-20
    "CASETYPE",  # 21-23
    "SURGSPEC", "ANESTHES", "INOUT", "OperYR",
    # -- Peri- and post-op --
    "TOTHLOS", "YRDEATH",
    # Unplanned re-operation
    "RETURNOR",
    "REOPERATION",
    *[f"REOPERATION{k}" for k in range(1, 4)],
    # Readmission --> 08-10 not reported
    "READMISSION",
    *[f"READMISSION{k}" for k in range(1, 6)],
    # Unplanned readmission --> 08-10 not reported
    "UNPLANNEDREADMISSION",  # this is in none of them
    *[f"UNPLANNEDREADMISSION{k}" for k in range(1, 6)],
    # -- Surgical complications (make summary variable too) --
    "SUPINFEC", "WNDINFD", "ORGSPCSSI", "DEHIS", "OTHBLEED",
    # -- Medical complications (make summary variable too) --
    "OTHDVT", "PULEMBOL", "OUPNEUMO", "REINTUB", "FAILWEAN", "URNINFEC",
    "CNSCVA", "CDARREST", "CDMI", "OTHSYSEP", "OTHSESHOCK", "OPRENAFL",
    "RENAINSF",
    "DISCHDEST",
]
## Lab columns are appended (in place) to the data columns
TARGET_DATA_COLS.extend(BLOOD_COLS)
## Every column we want in the df, upper-cased to normalize: ICD source
## columns, CPT source columns, data columns, then the new ICD and CPT column
## names taken from the keys of the two code dictionaries.
TARGET_COLS = [
    name.upper()
    for name in (
        *TARGET_ICD_COLS,
        *TARGET_CPT_COLS,
        *TARGET_DATA_COLS,
        *new_icd_dict,
        *new_cpt_dict,
    )
]

Extract/Filter on ICD

In [None]:
## Path to export ICD-filtered data
ICD_DIR = BASE_PATH / "data" / "raw" / "icd_filtered"

## Base file: build the new ICD columns from new_icd_dict, filter on all of
## them, and combine them into a single "MALIGNANT NEOPLASM" column.
## (What each keyword does is defined by create_and_filter_new_cols in
## src.clean_filter -- see that function for the exact semantics.)
icd_df_dict_base = create_and_filter_new_cols(
    new_col_dict=new_icd_dict,
    old_df_dict=raw_data_dict,
    export_dir=ICD_DIR,
    target_cols=TARGET_COLS,
    target_code_cols=TARGET_ICD_COLS,
    # Filter on all ICD cols
    filter_cols=list(new_icd_dict),
    cpt_flag=False,
    combine_col_name="MALIGNANT NEOPLASM",
    cols_to_combine=list(new_icd_dict),
)
## The individual ICD columns were combined above, so replace them in
## TARGET_COLS with the combined column. TARGET_COLS holds upper-cased names
## while the new_icd_dict keys are mixed case, so the comparison must be done
## on upper-cased names -- comparing against the raw keys removes nothing.
icd_cols_upper = {col.upper() for col in new_icd_dict}
TARGET_COLS = [col for col in TARGET_COLS if col.upper() not in icd_cols_upper]
## Guard the append so re-running this cell does not add a duplicate entry
if "MALIGNANT NEOPLASM" not in TARGET_COLS:
    TARGET_COLS.append("MALIGNANT NEOPLASM")

Extract/Filter on CPT

In [None]:
## Output location for the CPT-filtered files
CPT_DIR = BASE_PATH / "data" / "raw" / "cpt_filtered"

## Base file
# Only this subset of the new CPT columns (the four tongue-procedure
# categories) is passed as filter_cols; all categories in new_cpt_dict are
# still passed as new columns.
cpt_filter_cols = [
    "Partial Glossectomy (Hemiglossectomy_Subtotal)",
    "Composite_Extended Glossectomy",
    "Total Glossectomy (Complete Tongue Removal)",
    "Excision of Tongue Lesions (Minor)",
]
# Create + filter, starting from the ICD-filtered output rather than the
# raw data
cpt_df_dict_base = create_and_filter_new_cols(
    old_df_dict=icd_df_dict_base,
    new_col_dict=new_cpt_dict,
    target_code_cols=TARGET_CPT_COLS,
    target_cols=TARGET_COLS,
    filter_cols=cpt_filter_cols,  # subset of CPT cols
    export_dir=CPT_DIR,
    cpt_flag=True,
)