# Feature Importance (SHAP)


### Import Packages/Libraries + Data + Models


In [None]:
import sys

sys.path.append("../")
from src.data_utils import get_data, get_models, get_feature_lists
from src.config import BASE_PATH, SEED
from src.feat_importance import get_shap_single_model
from joblib import delayed, Parallel
from src.nn_models import load_nn_clf
from shutil import rmtree
from sklearn.model_selection import train_test_split
import shap

# NOTE(review): this assigns to a private shap attribute; presumably it is meant
# to silence SHAP's progress output — TODO confirm it has an effect in the
# installed shap version.
shap.utils._general._show_progress = False

Set Globals

In [None]:
## Data
# Maps each short outcome key to whatever get_data returns for the matching
# dataset name (later cells index it with "X_train", "y_train", "X_test", ...).
OUTCOME_DICT = {
    outcome_key: get_data(dataset_name)
    for outcome_key, dataset_name in (
        ("surg", "outcome_surg"),
        ("bleed", "outcome_bleed"),
        ("asp", "outcome_asp"),
        ("mort", "outcome_mort"),
        ("reop", "outcome_reop"),
    )
}

## Models
model_dir = BASE_PATH / "models" / "trained"
model_prefix_list = ["lr", "lgbm", "svc", "stack"]
# Number of input features for the neural nets; read from "surg" because it is
# the same for all outcomes.
X_shape = OUTCOME_DICT["surg"]["X_train"].shape[1]
MODEL_DICT = {}
for outcome in OUTCOME_DICT:
    ## Base models
    MODEL_DICT[outcome] = get_models(model_prefix_list, outcome, model_dir)
    ## Neural network (checkpoint sits next to the base models, loaded on CPU)
    MODEL_DICT[outcome]["nn"] = load_nn_clf(
        data_path=model_dir / outcome / "nn.pt",
        in_dim=X_shape,
        device="cpu",
    )

# Ordered list of feature names handed to the SHAP routine as `feat_order`.
# Every name is upper-cased in a single pass over the literal below.
FEAT_ORDER = [
    feat_name.upper()
    for feat_name in (
        # Pre-op
        "SEX",  # Nominal
        "RACE_NEW",  # Nominal
        "ETHNICITY_HISPANIC",  # Binary
        "INOUT",  # Binary
        "Age",  # Numerical
        "URGENCY",  # Nominal
        "BMI",
        "DIABETES",  # Binary
        "SMOKE",  # Binary
        "DYSPNEA",  # Binary
        "FNSTATUS2",
        "VENTILAT",
        "HXCOPD",
        "ASCITES",
        "HXCHF",
        "HYPERMED",
        "RENAFAIL",
        "DIALYSIS",
        "DISCANCR",
        "WNDINF",
        "STEROID",
        "WTLOSS",
        "BLEEDDIS",
        "TRANSFUS",
        "PRSEPIS",
        "PRALBUM",
        "PRWBC",
        "PRHCT",
        "PRPLATE",
        "ASACLAS",
        "OPERYR",
        # Intra-op
        "OPTIME",
        "Partial Glossectomy (Hemiglossectomy_Subtotal)",
        "Composite_Extended Glossectomy",
        "Total Glossectomy (Complete Tongue Removal)",
        "Excision of Tongue Lesions (Minor)",
        "Local_Regional Tissue Flaps for Oral Cavity Reconstruction",
        "Free Tissue Transfer (Microvascular Free Flaps) and Complex Flap Reconstruction",
        "Skin Autografts for Head and Neck Reconstruction",
        "Neck Dissection and Lymphadenectomy Procedures",
        "Alveolar Ridge and Gingival Procedures",
        "Mandibular Resection and Reconstruction Procedures",
        "Peripheral Nerve Repair and Neuroplasty",
        "Tracheostomy Procedures",
        "Gastrostomy and Esophageal Access Procedures",
        "Submandibular Gland Excision",
        "Parotid Gland Excision",
        "Laryngeal Resection and Reconstruction Procedures",
        "Pharyngeal Resection and Reconstruction Procedures",
        "Tonsillectomy and Tonsillar Region Procedures",
        "Malignant neoplasm",
    )
]

Sanity check: ensure `FEAT_ORDER` covers exactly the set of features present in the data columns

In [None]:
# Collapse the data's column names back to base feature names and verify that
# FEAT_ORDER lists exactly that set. Only the column labels are needed, so a
# 5-row slice of the "surg" test set is enough.
dummy_df = OUTCOME_DICT["surg"]["X_test"][:5]

# Prefixes (text before the first "_") whose underscore belongs to the feature
# name itself, so the column name is kept whole instead of being truncated.
whole_name_prefixes = {
    "ETHNICITY",
    "PARTIAL GLOSSECTOMY (HEMIGLOSSECTOMY",
    "COMPOSITE",
    "LOCAL",
}

all_cols = set()
for col in dummy_df.columns:
    col_split = col.split("_")
    if len(col_split) == 1 or col_split[0] in whole_name_prefixes:
        all_cols.add(col)
    elif col_split[:2] == ["RACE", "NEW"]:
        # "RACE_NEW" contains an underscore itself; keep both leading pieces.
        all_cols.add("RACE_NEW")
    else:
        # Presumably an encoded category column ("<FEATURE>_<level>"): keep the
        # part before the first underscore.
        all_cols.add(col_split[0])

# Report exactly which names disagree instead of failing with a bare assert.
expected_feats = set(FEAT_ORDER)
missing_from_data = sorted(expected_feats - all_cols)
missing_from_feat_order = sorted(all_cols - expected_feats)
assert not missing_from_data and not missing_from_feat_order, (
    "FEAT_ORDER does not match the data columns: "
    f"in FEAT_ORDER but not in data={missing_from_data}; "
    f"in data but not in FEAT_ORDER={missing_from_feat_order}"
)

## SHAP


Build one SHAP job per (outcome, model) pair, to be run in parallel below

In [None]:
# Build one delayed SHAP job per (outcome, model) pair. Nothing is executed
# here; the jobs only run once they are handed to joblib.Parallel.
jobs = []
for outcome_name, outcome_data in OUTCOME_DICT.items():
    X_train = outcome_data["X_train"]
    y_train = outcome_data["y_train"]
    X_test = outcome_data["X_test"]
    cur_model_dict = MODEL_DICT[outcome_name]
    # Stratified 1% subsample of the training set, used as background data for
    # the "stack" and "nn" models. It depends only on the outcome's training
    # data and SEED, so it is computed lazily once per outcome and shared by
    # both models rather than being re-split for each of them.
    X_background_small = None
    for model_name, model in cur_model_dict.items():
        if model_name in ("stack", "nn"):
            if X_background_small is None:
                X_background_small, _, _, _ = train_test_split(
                    X_train,
                    y_train,
                    stratify=y_train,
                    random_state=SEED,
                    train_size=0.01,
                )
            X_background = X_background_small
        else:
            # All other models use the full training set as background.
            X_background = X_train
        # ================================> CALL SHAP
        # Delete any log/result file left over from a previous run before
        # queuing the job that is given these paths.
        log_path = BASE_PATH / "shap_logs" / model_name / f"{outcome_name}.log"
        if log_path.exists():
            log_path.unlink()
        result_path = (
            BASE_PATH
            / "results"
            / "tables"
            / "SHAP"
            / outcome_name
            / f"{model_name}.xlsx"
        )
        if result_path.exists():
            result_path.unlink()
        jobs.append(
            delayed(get_shap_single_model)(
                model=model,
                model_name=model_name,
                feat_order=FEAT_ORDER,
                outcome_name=outcome_name,
                explanation_vals=X_test,
                background_vals=X_background,
                log_path=log_path,
                result_path=result_path,
            )
        )

In [None]:
# Display how many SHAP jobs were queued (one per outcome/model pair).
len(jobs)

In [None]:
# Execute the queued SHAP jobs in separate loky worker processes, using at most
# 25 workers (5 outcomes * 5 models) and never more workers than jobs.
print("=== Starting jobs on CPU ===")
worker_count = min(25, len(jobs))
Parallel(n_jobs=worker_count, backend="loky")(jobs)