# Permutation Importance

## Set Up

Libraries/packages

In [1]:
import sys

# Add the parent directory to the import path so the `src.*` imports below
# resolve; this line must run before them.
sys.path.append("../")
from src.data_utils import get_data, get_models, get_feature_lists
from src.config import BASE_PATH, SEED
from src.feat_importance import plot_perm_single_model
from joblib import delayed, Parallel
# Echo the configured base path so a wrong working directory is visible early.
print(f"Root path: {BASE_PATH}")
from sklearn.model_selection import train_test_split

Globals

In [2]:
# Directory that holds the evaluation results
SAVE_PATH = BASE_PATH / "results"

# Processed data, loaded once per outcome and keyed by the outcome's short name
file_dir = BASE_PATH / "data" / "processed"
OUTCOME_DICT = {
    short_name: get_data(file_stem, file_dir)
    for short_name, file_stem in (
        ("med", "med_outcome"),
        ("surg", "surg_outcome"),
        ("mort", "mort_outcome"),
        ("reop", "reop_outcome"),
        ("vte", "vte_outcome"),
    )
}

# Models from the "calibrated" directory, looked up per outcome for each prefix
model_dir = BASE_PATH / "models" / "calibrated"
model_prefix_list = ["lgbm", "lr", "xgb", "nn", "stack"]
MODEL_DICT = {}
for outcome in OUTCOME_DICT:
    MODEL_DICT[outcome] = get_models(model_prefix_list, outcome, model_dir)

## Run permutation

Sequentially (kept for reference; the cell below is fully commented out in favor of the parallel run)

In [3]:
# for outcome_name, outcome_data in OUTCOME_DICT.items():
#     save_dir = BASE_PATH / "results" / "figures" / outcome_name
#     X_test = outcome_data["X_test"]
#     y_test = outcome_data["y_test"]
#     ## Use only ~22k patients
#     _, X_sub, _, y_sub = train_test_split(X_test,y_test, stratify = y_test, test_size=0.2)
#     cur_model_dict = MODEL_DICT[outcome_name]
#     for model_name, model in cur_model_dict.items():
#         if model_name != 'stack':
#             continue
#         plot_perm_single_model(
#             model_name=model_name,
#             model=model,
#             outcome_name=outcome_name,
#             X=X_sub,
#             y=y_sub,
#             log_path=BASE_PATH / "perm_logs"/ model_name/f"{outcome_name}.log",
#             n_repeats=2,
#             result_path=BASE_PATH / "figures"/ model_name,
#             show_output=True,
#             scoring="roc_auc",
#             rand_state = SEED,
#             batch_size=1,
#         )

Parallel

In [None]:
# Queue one deferred permutation-importance task per (outcome, model) pair.
# Nothing runs here: `delayed(...)` only records the call so the tasks can be
# dispatched together later.
jobs = []
for outcome_name, outcome_data in OUTCOME_DICT.items():
    X_test = outcome_data["X_test"]
    y_test = outcome_data["y_test"]
    ## Use only ~16k patients
    # Stratified 15% subsample of the test split. Seeding the split keeps the
    # subsample identical across reruns, so results are reproducible.
    _, X_sub, _, y_sub = train_test_split(
        X_test,
        y_test,
        stratify=y_test,
        test_size=0.15,
        random_state=SEED,
    )
    # All models of the outcome are queued. To restrict the run to a single
    # model (e.g. "stack"), filter on `model_name` here and adjust the
    # expected job count below accordingly.
    for model_name, model in MODEL_DICT[outcome_name].items():
        jobs.append(
            delayed(plot_perm_single_model)(
                model_name=model_name,
                model=model,
                outcome_name=outcome_name,
                X=X_sub,
                y=y_sub,
                log_path=BASE_PATH / "perm_logs" / model_name / f"{outcome_name}.log",
                n_repeats=200,
                result_path=BASE_PATH / "results" / "figures" / model_name,
                show_output=True,
                scoring="roc_auc",
                rand_state=SEED,
                batch_size=5,
            )
        )

# Sanity check that every model of every outcome was queued. The previous
# fixed value of 5 only held while a single model per outcome was selected.
# NOTE(review): assumes get_models returns one model per entry of
# model_prefix_list -- confirm against src.data_utils.
expected_jobs = len(OUTCOME_DICT) * len(model_prefix_list)
assert len(jobs) == expected_jobs, (
    f"Expected {expected_jobs} jobs, got {len(jobs)}"
)

In [None]:
# Dispatch all queued jobs with the loky backend, using at most 25 workers
# (fewer when there are fewer jobs than that).
print("=== Starting jobs on CPU ===")
n_workers = min(25, len(jobs))
Parallel(n_jobs=n_workers, backend="loky")(jobs)