In [1]:
import numpy as np
import pandas as pd
import xgboost
import yaml
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import shap
from sklearn.model_selection import StratifiedKFold

class generate_model():
    
    def __init__(self, predictor_col: str, data_path: str,xg_params: dict, kfold_splits : int , test_size=0.3, removed_features: list[str] = None, seed: float = None) -> None:
        if removed_features is None:
            removed_features = []
        dataset = pd.read_csv(data_path)

        # Variable encoding
        dataset["Género"] = dataset["Género"].replace({"M": 0, "F": 1}).astype("category")
        dataset["ATPII/AHA/IDF"] = (
            dataset["ATPII/AHA/IDF"].replace({"no": 0, "si": 1}).astype("category")
        )
        dataset["aleator"] = (
            dataset["aleator"]
            .replace({"Control": 0, "PKU 1": 1, "PKU 2": 2})
            .astype("category")
        )

        y_df = dataset[predictor_col].replace({"No": 0, "Si": 1}).astype("category")
        X_df = dataset.drop(predictor_col, axis="columns")

        X_df = X_df.drop(removed_features, axis="columns")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                                                                X_df, y_df, stratify=y_df, test_size=test_size, random_state=seed
                                                            )
    #def xg_train(self, xg_params: dict, kfold_splits=5, seed=None):

        cv = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=seed)
        folds = list(cv.split(self.X_train, self.y_train))

        for train_idx, val_idx in folds:
            # Sub-empaquetado del train-set en formato de XGBoost
            dtrain = xgboost.DMatrix(
                self.X_train.iloc[train_idx, :],
                label=self.y_train.iloc[train_idx],
                enable_categorical=True,
            )
            dval = xgboost.DMatrix(
                self.X_train.iloc[val_idx, :],
                label=self.y_train.iloc[val_idx],
                enable_categorical=True,
            )

            self.model = xgboost.train(
                dtrain=dtrain,
                params=xg_params,
                evals=[(dtrain, "train"), (dval, "val")],
                num_boost_round=1000,
                verbose_eval=False,
                early_stopping_rounds=10,
            )
    def get_AUC_on_test_data(self):
        #def xg_test(model, X_test, y_test) -> float:
        testset = xgboost.DMatrix(self.X_test, label=self.y_test, enable_categorical=True)
        y_preds = self.model.predict(testset)

        return roc_auc_score(testset.get_label(), y_preds)
    
    
    def get_feature_metrics(self):
        internal_feature_metrics = pd.DataFrame(
            {
                "Weight": self.model.get_score(importance_type="weight"),
                "Coverage": self.model.get_score(importance_type="cover"),
                "Gain": self.model.get_score(importance_type="gain"),
            }
        ).sort_values(by="Gain", ascending=False)

        explainer = shap.TreeExplainer(self.model)

        # Extrae la explicacion SHAP en un DF
        EXPLAINATION = explainer(self.X_test).cohorts(
            self.y_test.replace({0: "Healty", 1: "Abnormal"}).tolist()
        )
        cohort_exps = list(EXPLAINATION.cohorts.values())

        exp_shap_abnormal = pd.DataFrame(
            cohort_exps[0].values, columns=cohort_exps[0].feature_names
        )  # .abs().mean()

        exp_shap_healty = pd.DataFrame(
            cohort_exps[1].values, columns=cohort_exps[1].feature_names
        )  # .abs().mean()


        feature_metrics = pd.concat(
            {
                "SHAP_healty": exp_shap_healty.abs().mean(),
                "SHAP_abnormal": exp_shap_abnormal.abs().mean(),
            },
            axis="columns",
        )

        feature_metrics = (
            feature_metrics.join(internal_feature_metrics)
            .fillna(0)
            .sort_values(by="Gain", ascending=False)
        )

        return feature_metrics
    
      
    
with open("params.yml", "r") as f:
        ext_params = yaml.load(f, Loader=yaml.FullLoader)
   
   
def objective(trial) -> float:
    """
    The function that runs a single model and evaluates it.
    """
    params = {
            "seed": trial.suggest_int("seed", 1, 10_000),
            #"kfold_splits": trial.suggest_int("kfold_splits", 3, 5),
            # XGB_hiperparams
            'max_depth': trial.suggest_int("max_depth", 2, 10),
            "eta": trial.suggest_float("eta", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.5, 0.9),
            # "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
            "objective": "binary:logistic",
            "eval_metric": "logloss",
        }
                
    model_instance = generate_model("HOMA-IR alterado", "data.csv", removed_features = ext_params["feature_engineering"]["removed_features"], xg_params = params, kfold_splits = 5, seed=params["seed"])   
    
    return model_instance.get_AUC_on_test_data(), model_instance.get_feature_metrics()["SHAP_abnormal"]["fenilalax"]



In [None]:
import optuna
study_name: str = "oversample_SMOTE"

study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///homa_studies.db",
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.NSGAIISampler(),
    load_if_exists=True,
)

study.optimize(objective, n_trials=500, n_jobs=-1)


In [8]:
import pandas as pd
from optuna.visualization._pareto_front import _get_pareto_front_info, _make_scatter_object, _make_marker, _make_hovertext
from typing import Sequence
from optuna.trial import FrozenTrial
from optuna.visualization._plotly_imports import go


info = _get_pareto_front_info(study)


n_targets: int = info.n_targets  
axis_order: Sequence[int]  = info.axis_order 
include_dominated_trials: bool = True
trials_with_values: Sequence[tuple[FrozenTrial, Sequence[float]]] = info.non_best_trials_with_values
hovertemplate: str = "%{text}<extra>Trial</extra>"
infeasible: bool = False
dominated_trials: bool = False

def trials_df(trials_with_values, class_name):
    x  =[values[axis_order[0]] for _, values in trials_with_values]
    y  =[values[axis_order[1]] for _, values in trials_with_values]

    df = pd.DataFrame({'x':x, 'y':y})
    df['class'] = class_name
    return df

df_best    = trials_df(info.best_trials_with_values, 'best')
df_nonbest = trials_df(info.non_best_trials_with_values, 'nonbest')
#df         = pd.concat([df_best, df_nonbest])

fig = go.Figure()

fig.add_trace(go.Scatter(x=df_nonbest ['x'], y=df_nonbest['y'], mode='markers', marker=dict(size=8,opacity=0.5, symbol='circle', line=dict(width=1, color='black')), name='Suboptimal'))
fig.add_trace(go.Scatter(x=df_best['x'], y=df_best['y'], mode='markers', marker=dict(size=8,opacity=0.5, symbol='square', line=dict(width=1, color='black')), name='Pareto frontier'))
# Define the width and height of the plot
plot_width = 800
plot_height = 400

# Customize the plot
fig.update_layout(
    #title='Two-objective optimization',
    
    xaxis_title='AUC',
    yaxis_title='Phe importance',
    font=dict(size=14),
    #plot_bgcolor='white',
    # xaxis=dict(range=[min(df['x']) - 1, max(df['x']) + 1]),
    # yaxis=dict(range=[min(df['y']) - 1, max(df['y']) + 1]),
    legend=dict(x=0.3, y=1.1, bgcolor='rgba(255, 255, 255, 0)', orientation='h'),
    width=plot_width,
    height=plot_height,
    margin=dict(l=0, r=0, t=0, b=0)
)

# Show the plot
fig.show()

In [4]:
# Extract the best trials from the multi-objective study
best_trials = study.best_trials

# Print information about the best trials
for i, trial in enumerate(best_trials):
    print(f"Best trial {i+1} number: {trial.number}")
    print(f"Best trial {i+1} values: {trial.values}")
    print(f"Best trial {i+1} parameters: {trial.params}")
    print()


Best trial 1 number: 63
Best trial 1 values: [0.9230769230769231, 0.27638572454452515]
Best trial 1 parameters: {'kfold_splits': 4, 'learning_rate': 0.2933073187615315, 'seed': 7173}

Best trial 2 number: 183
Best trial 2 values: [0.8461538461538461, 0.3588626980781555]
Best trial 2 parameters: {'eta': 0.12650514455495748, 'max_depth': 8, 'seed': 519, 'subsample': 0.743822414749286}

Best trial 3 number: 691
Best trial 3 values: [0.846153846153846, 0.46692225337028503]
Best trial 3 parameters: {'eta': 0.16361017587073823, 'max_depth': 9, 'seed': 6890, 'subsample': 0.6086136503633341}

Best trial 4 number: 751
Best trial 4 values: [0.7692307692307692, 0.5015076398849487]
Best trial 4 parameters: {'eta': 0.15868174944621136, 'max_depth': 8, 'seed': 3901, 'subsample': 0.87857187122177}

Best trial 5 number: 1031
Best trial 5 values: [0.2777777777777778, 0.6634809970855713]
Best trial 5 parameters: {'seed': 5474, 'max_depth': 5, 'eta': 0.2689014207979416, 'subsample': 0.7936735664234864}

