# Tune LR, LightGBM, and XGBoost models with Optuna

## Set Up

Import packages, libraries, and project globals

In [None]:
import sys

sys.path.append("../")
from src.config import BASE_PATH
from src.data_utils import get_data
from src.tune_base import (
    lightgbm_model_builder,
    tune_model_mult_outcomes,
    lr_model_builder,
    xgb_model_builder,
    get_prelim_results,
)

Import data and define notebook globals

In [None]:
# Path handed to get_data for every outcome dataset.
file_dir = BASE_PATH / "data" / "processed"

# Outcome key -> dataset name, loaded in this order.
OUTCOME_DICT = {
    outcome: get_data(dataset_name, file_dir)
    for outcome, dataset_name in (
        ("med", "med_outcome"),
        ("surg", "surg_outcome"),
        ("mort", "mort_outcome"),
        ("reop", "reop_outcome"),
        ("vte", "vte_outcome"),
    )
}

# Per-model builder and timeout. Don't set "max_timeout" below 15s.
MODEL_CONFIG = {
    abrv: {"model_builder": builder, "max_timeout": max_timeout}
    for abrv, builder, max_timeout in (
        ("lr", lr_model_builder, 300),
        ("lgbm", lightgbm_model_builder, 360),
        ("xgb", xgb_model_builder, 600),
    )
}

# Output locations for tuning logs and tuning results.
LOG_PATH = BASE_PATH / "logs" / "base_2"
RESULT_PATH = BASE_PATH / "models" / "tune_results_2"

# Trial budget passed to the tuning calls.
NUM_TRIALS = 150
NUM_PARALLEL_TRIALS = 1

## TUNE

Single-run

In [None]:
# Disabled template: tune a single model across every outcome in OUTCOME_DICT.
# To use it, uncomment the lines below and set model_abrv to one of the
# MODEL_CONFIG keys ("lr", "lgbm" or "xgb"); the empty-string placeholder is
# not a key of MODEL_CONFIG.
# model_abrv = ""
# tune_model_mult_outcomes(
#     model_builder=MODEL_CONFIG[model_abrv]["model_builder"],
#     model_abrv=model_abrv,
#     outcome_dict=OUTCOME_DICT,
#     scoring="roc_auc",
#     log_file_path=LOG_PATH / f"{model_abrv}.log",
#     save_path=RESULT_PATH / f"{model_abrv}.json",
#     n_trials=NUM_TRIALS,
#     n_parallel_trials=NUM_PARALLEL_TRIALS,
#     timeout_per_trial=MODEL_CONFIG[model_abrv]["max_timeout"],
#     clear_progress=True,
# )

Sequential

In [None]:
# Disabled: tunes every model in MODEL_CONFIG one after another, using the
# same arguments as the single-run call. Uncomment to run.
# for model_abrv in MODEL_CONFIG.keys():
#     tune_model_mult_outcomes(
#         model_builder=MODEL_CONFIG[model_abrv]["model_builder"],
#         model_abrv=model_abrv,
#         outcome_dict=OUTCOME_DICT,
#         scoring="roc_auc",
#         log_file_path=LOG_PATH / f"{model_abrv}.log",
#         save_path=RESULT_PATH / f"{model_abrv}.json",
#         n_trials=NUM_TRIALS,
#         n_parallel_trials=NUM_PARALLEL_TRIALS,
#         timeout_per_trial=MODEL_CONFIG[model_abrv]["max_timeout"],
#         clear_progress=True,
#     )

Parallel

In [None]:
from joblib import Parallel, delayed

# Models to tune in this run; each entry must be a key of MODEL_CONFIG.
# Extend to ("lr", "lgbm", "xgb") to tune every configured model in parallel.
MODELS_TO_TUNE = ("lr",)

# One deferred tune_model_mult_outcomes call per selected model, each with
# its own log file and results file named after the model abbreviation.
jobs = [
    delayed(tune_model_mult_outcomes)(
        model_builder=cfg["model_builder"],
        model_abrv=model_abrv,
        outcome_dict=OUTCOME_DICT,
        scoring="roc_auc",
        log_file_path=LOG_PATH / f"{model_abrv}.log",
        save_path=RESULT_PATH / f"{model_abrv}.json",
        n_trials=NUM_TRIALS,
        n_parallel_trials=NUM_PARALLEL_TRIALS,
        timeout_per_trial=cfg["max_timeout"],
        clear_progress=True,
    )
    for model_abrv, cfg in MODEL_CONFIG.items()
    if model_abrv in MODELS_TO_TUNE
]

# Size the pool from the jobs actually queued rather than from MODEL_CONFIG,
# so no idle workers are requested when only a subset of models is selected.
# The floor of 1 keeps n_jobs valid if the selection is empty (joblib
# rejects n_jobs=0).
results = Parallel(n_jobs=max(1, len(jobs)))(jobs)

## Train models and get preliminary results

In [None]:
# Directory passed to get_prelim_results as the model save location.
model_save_dir = BASE_PATH / "models" / "trained"

# Call get_prelim_results once per configured model, pointing it at that
# model's "<abbreviation>.json" file under RESULT_PATH.
for model_abrv, spec in MODEL_CONFIG.items():
    get_prelim_results(
        results_path=RESULT_PATH / f"{model_abrv}.json",
        model_builder=spec["model_builder"],
        model_abrv=model_abrv,
        outcome_dict=OUTCOME_DICT,
        model_save_dir=model_save_dir,
    )