# Prepare Models and other data for Deployment on Streamlit

Import the models and supporting artifacts produced by the ML workflow and export them into the `app/` directory.

## Set Up

In [None]:
import sys

sys.path.append("../")
from src.config import BASE_PATH, SEED
from src.data_utils import get_data, get_models
from app.config import CHOSEN_MODEL_DICT  # type: ignore
from app import shap_utils
import numpy as np
import pandas as pd
import joblib
from shutil import rmtree
from sklearn.model_selection import train_test_split
import shap

## Imports/Exports

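Initialize the output directories under `app/`. Each directory is deleted if it already exists and then recreated empty.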

In [None]:
def _recreate_dir(path):
    """Remove ``path`` if it exists, create it again (with parents), and return it."""
    if path.exists():
        rmtree(path)
    # exist_ok=False: raise instead of silently reusing a directory that is still there.
    path.mkdir(exist_ok=False, parents=True)
    return path


# Every export target lives directly under <BASE_PATH>/app.
_app_dir = BASE_PATH / "app"

## MODELS
cal_model_output_dir = _recreate_dir(_app_dir / "models")

## PIPELINES
pipe_output_dir = _recreate_dir(_app_dir / "preprocessors")

## ALL PREDS
pred_output_dir = _recreate_dir(_app_dir / "all_preds")

## BIN THRESHOLDS
threshold_output_dir = _recreate_dir(_app_dir / "bin_thresholds")

## SHAP explainers
shap_output_dir = _recreate_dir(_app_dir / "shap_explainers")

For each outcome, load the chosen model's artifacts and export them into the `app/` directories.

In [None]:
## GET OUTCOME PIPELINES, CHOSEN MODELS, AND ASSOCIATED PREDS/THRESHOLDS
# For every (outcome, chosen model) pair in CHOSEN_MODEL_DICT: re-export the
# preprocessing pipeline, calibrated model, predictions and bin thresholds into
# the app/ output directories, then build and save a SHAP explainer for the
# uncalibrated model.

# Populated from X_test inside the loop; stays None if CHOSEN_MODEL_DICT is empty.
feature_names = None

for outcome_name, chosen_model in CHOSEN_MODEL_DICT.items():
    print(f"Outcome: {outcome_name}, model: {chosen_model}")
    # ================> Pipelines (shared by all models)
    pipeline = joblib.load(
        BASE_PATH / "data" / "preprocessors" / f"outcome_{outcome_name}_pipeline.joblib"
    )
    joblib.dump(pipeline, pipe_output_dir / f"{outcome_name}_pipeline.joblib")
    # ================> MODELS
    # Calibrated models for predictions
    cal_model = joblib.load(
        BASE_PATH / "cal_models" / outcome_name / f"{chosen_model}.joblib"
    )
    joblib.dump(
        cal_model, cal_model_output_dir / f"{outcome_name}_{chosen_model}.joblib"
    )
    # ================> All Preds
    test_preds = pd.read_parquet(
        BASE_PATH
        / "results"
        / "app"
        / "all_preds"
        / outcome_name
        / f"{chosen_model}.parquet"
    )
    test_preds.to_parquet(pred_output_dir / f"{outcome_name}_{chosen_model}.parquet")
    # ================> Bin Thresholds
    # np.load on an .npz returns an NpzFile holding an open file handle; the
    # context manager closes it so handles do not accumulate across iterations.
    with np.load(
        BASE_PATH
        / "results"
        / "app"
        / "bin_thresholds"
        / outcome_name
        / f"{chosen_model}.npz"
    ) as bin_thresholds:
        np.savez(
            threshold_output_dir / f"{outcome_name}_{chosen_model}.npz",
            thresholds=bin_thresholds["thresholds"],
        )
    # ================> SHAP explainer
    X_test = pd.read_parquet(
        BASE_PATH / "data" / "processed" / f"outcome_{outcome_name}" / "X_test.parquet"
    )
    feature_names = X_test.columns.tolist()
    # The explainer is built on the uncalibrated model, not on cal_model.
    un_cal_model = joblib.load(
        BASE_PATH / "models" / "trained" / outcome_name / f"{chosen_model}.joblib"
    )

    if chosen_model == "lgbm":
        print("Using TreeExplainer for LightGBM...")
        # X_test is passed as the background data for interventional perturbation.
        explainer = shap.TreeExplainer(
            model=un_cal_model,
            data=X_test,
            feature_perturbation="interventional",
            model_output="raw",
            feature_names=feature_names,
        )

    elif chosen_model in ["lr", "svc"]:
        print("Using LinearExplainer...")
        # The masker is the (mean, covariance) pair computed from X_test.
        mean = X_test.mean(axis=0).values
        cov = np.cov(X_test.T)
        explainer = shap.LinearExplainer(
            model=un_cal_model,
            masker=(mean, cov),
        )

    else:
        raise ValueError(f"Unrecognized model {chosen_model}")

    joblib.dump(explainer, shap_output_dir / f"{outcome_name}.joblib")

# Save feature names
# NOTE(review): only the columns of the LAST outcome's X_test are saved. This
# presumably relies on every outcome sharing the same feature set -- confirm
# against how the app reads feature_names.joblib.
if feature_names is None:
    raise ValueError("CHOSEN_MODEL_DICT is empty; there are no feature names to save")
joblib.dump(feature_names, shap_output_dir / "feature_names.joblib")