# Model Development

## Set Up

Import libraries/packages and the project helpers used to load the preprocessed data and tune the models

In [None]:
import sys

sys.path.append("../")
from src.config import BASE_PATH
from src.data_utils import get_data
from src.tune_base import (
    lr_model_builder_stage1,
    lightgbm_model_builder_stage1,
    xgb_model_builder_stage1,
    knn_model_builder_stage1,
    svc_model_builder_stage1,
    nn_model_builder_stage1,
    tune_models,
    get_prelim_results,
)

Load the data and define the global settings

In [None]:
# Location of the processed data (not referenced again in the cells shown here).
file_dir = BASE_PATH / "data" / "processed"

# The two dataset variants returned by get_data, keyed by the name the
# tuning/training cells use to look them up.
DATA_DICT = {
    "base": get_data(is_nomo=False),
    "nomo": get_data(is_nomo=True),
}

# One row per model: (abbreviation, stage-1 model builder, max timeout).
# Don't set the max timeout below 15s.
_MODEL_SPECS = (
    ("lr", lr_model_builder_stage1, 60),
    ("lgbm", lightgbm_model_builder_stage1, 60),
    ("xgb", xgb_model_builder_stage1, 60),
    ("knn", knn_model_builder_stage1, 60),
    ("svc", svc_model_builder_stage1, 60),
)

# Per-model settings keyed by abbreviation; insertion order follows _MODEL_SPECS.
MODEL_CONFIG = {
    abrv: {"model_builder": builder, "max_timeout": timeout}
    for abrv, builder, timeout in _MODEL_SPECS
}

# Per-model log files and tuning results are written under these directories.
LOG_PATH = BASE_PATH / "logs" / "phase_1"
RESULT_PATH = BASE_PATH / "models" / "tune_results" / "phase_1"

# Values passed to tune_models as n_trials and n_parallel_trials.
NUM_TRIALS = 500
NUM_PARALLEL_TRIALS = 1

## TUNE

Parallel tune: run the tuning for all models concurrently with joblib

In [None]:
from joblib import Parallel, delayed

# Wrap tune_models once; each call below only records the arguments for a
# deferred invocation instead of running the tuning immediately.
deferred_tune = delayed(tune_models)

jobs = []
for model_abrv, cfg in MODEL_CONFIG.items():
    # "lr" is tuned on the "nomo" dataset; every other model uses "base".
    data_key = "nomo" if model_abrv == "lr" else "base"
    outcome_data = DATA_DICT[data_key]
    pending = deferred_tune(
        model_builder=cfg["model_builder"],
        model_abrv=model_abrv,
        outcome_data=outcome_data,
        scoring="roc_auc",
        # Each model gets its own log file and its own results file.
        log_file_path=LOG_PATH / f"{model_abrv}.log",
        save_path=RESULT_PATH / f"{model_abrv}.json",
        n_trials=NUM_TRIALS,
        n_parallel_trials=NUM_PARALLEL_TRIALS,
        timeout_per_trial=cfg["max_timeout"],
        clear_progress=True,
    )
    jobs.append(pending)

# n_jobs equals the number of configured models, so every deferred call can
# be dispatched to its own worker.
results = Parallel(n_jobs=len(MODEL_CONFIG))(jobs)

Sequential tune: run the tuning for each model one after another

In [None]:
# Tune every configured model in turn, in MODEL_CONFIG order.
for model_abrv, cfg in MODEL_CONFIG.items():
    # "lr" is tuned on the "nomo" dataset; every other model uses "base".
    outcome_data = DATA_DICT["nomo"] if model_abrv == "lr" else DATA_DICT["base"]

    # Collect the arguments first so the call itself stays short.
    tune_kwargs = {
        "model_builder": cfg["model_builder"],
        "model_abrv": model_abrv,
        "outcome_data": outcome_data,
        "scoring": "roc_auc",
        # Each model gets its own log file and its own results file.
        "log_file_path": LOG_PATH / f"{model_abrv}.log",
        "save_path": RESULT_PATH / f"{model_abrv}.json",
        "n_trials": NUM_TRIALS,
        "n_parallel_trials": NUM_PARALLEL_TRIALS,
        "timeout_per_trial": cfg["max_timeout"],
        "clear_progress": True,
    }
    tune_models(**tune_kwargs)

## Train models + get preliminary results

In [None]:
# Directory handed to get_prelim_results as model_save_dir.
model_save_dir = BASE_PATH / "models" / "phase_1_trained"

# Iterate key/value pairs directly instead of looping over .keys() and
# re-indexing MODEL_CONFIG, matching the tuning loops above.
for model_abrv, cfg in MODEL_CONFIG.items():
    model_builder = cfg["model_builder"]
    # "lr" uses the "nomo" dataset; every other model uses "base" (same
    # selection rule as in the tuning cells).
    outcome_data = DATA_DICT["nomo"] if model_abrv == "lr" else DATA_DICT["base"]
    get_prelim_results(
        # Same per-model JSON path the tuning cells pass as save_path.
        results_path=RESULT_PATH / f"{model_abrv}.json",
        model_builder=model_builder,
        model_abrv=model_abrv,
        outcome_data=outcome_data,
        model_save_dir=model_save_dir,
    )