In [161]:
import numpy as np
import pandas as pd
import xgboost
import yaml
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import shap
from sklearn.model_selection import StratifiedKFold

class generate_model():
    
    def __init__(self, predictor_col: str, data_path: str,xg_params: dict, kfold_splits : int , test_size=0.3, removed_features: list[str] = None, seed: float = None) -> None:
        if removed_features is None:
            removed_features = []
        dataset = pd.read_csv(data_path)

        # Variable encoding
        dataset["Género"] = dataset["Género"].replace({"M": 0, "F": 1}).astype("category")
        dataset["ATPII/AHA/IDF"] = (
            dataset["ATPII/AHA/IDF"].replace({"no": 0, "si": 1}).astype("category")
        )
        dataset["aleator"] = (
            dataset["aleator"]
            .replace({"Control": 0, "PKU 1": 1, "PKU 2": 2})
            .astype("category")
        )

        y_df = dataset[predictor_col].replace({"No": 0, "Si": 1}).astype("category")
        X_df = dataset.drop(predictor_col, axis="columns")

        X_df = X_df.drop(removed_features, axis="columns")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                                                                X_df, y_df, stratify=y_df, test_size=test_size, random_state=seed
                                                            )
    #def xg_train(self, xg_params: dict, kfold_splits=5, seed=None):

        cv = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=seed)
        folds = list(cv.split(self.X_train, self.y_train))

        for train_idx, val_idx in folds:
            # Sub-empaquetado del train-set en formato de XGBoost
            dtrain = xgboost.DMatrix(
                self.X_train.iloc[train_idx, :],
                label=self.y_train.iloc[train_idx],
                enable_categorical=True,
            )
            dval = xgboost.DMatrix(
                self.X_train.iloc[val_idx, :],
                label=self.y_train.iloc[val_idx],
                enable_categorical=True,
            )

            self.model = xgboost.train(
                dtrain=dtrain,
                params=xg_params,
                evals=[(dtrain, "train"), (dval, "val")],
                num_boost_round=1000,
                verbose_eval=False,
                early_stopping_rounds=10,
            )
    def get_AUC_on_test_data(self):
        #def xg_test(model, X_test, y_test) -> float:
        testset = xgboost.DMatrix(self.X_test, label=self.y_test, enable_categorical=True)
        y_preds = self.model.predict(testset)

        return roc_auc_score(testset.get_label(), y_preds)
    
    
    def get_feature_metrics(self):
        # internal_feature_metrics = pd.DataFrame(
        #     {
        #         "Weight": self.model.get_score(importance_type="weight"),
        #         "Coverage": self.model.get_score(importance_type="cover"),
        #         "Gain": self.model.get_score(importance_type="gain"),
        #     }
        # ).sort_values(by="Gain", ascending=False)

        explainer = shap.TreeExplainer(self.model)

        # Extrae la explicacion SHAP en un DF
        EXPLAINATION = explainer(self.X_test).cohorts(
            self.y_test.replace({0: "Healty", 1: "Abnormal"}).tolist()
        )
        cohort_exps = list(EXPLAINATION.cohorts.values())

        exp_shap_abnormal = pd.DataFrame(
            cohort_exps[0].values, columns=cohort_exps[0].feature_names
        )  # .abs().mean()

        exp_shap_healty = pd.DataFrame(
            cohort_exps[1].values, columns=cohort_exps[1].feature_names
        )  # .abs().mean()


        feature_metrics = pd.concat(
            {
                "SHAP_healty": exp_shap_healty.abs().mean(),
                "SHAP_abnormal": exp_shap_abnormal.abs().mean(),
            },
            axis="columns",
        )

        # feature_metrics = (
        #     feature_metrics.join(internal_feature_metrics)
        #     .fillna(0)
        #     .sort_values(by="Gain", ascending=False)
        # )

        return feature_metrics
    
      
    
with open("params.yml", "r") as f:
        ext_params = yaml.load(f, Loader=yaml.FullLoader)
   
   
def objective(trial) -> float:
    """
    The function that runs a single model and evaluates it.
    """

    seed = trial.suggest_int("seed", 1, 10_000)

    params={"objective":   "binary:logistic",
            "eval_metric": "logloss",
            'max_depth':   trial.suggest_int("max_depth", 2, 6, ),
            "eta":         trial.suggest_float("eta", 0.01, 0.3),
            "subsample":   trial.suggest_float("subsample", 0.5, 0.9),
            "lambda": trial.suggest_float("lambda", 0, 1),
            "alpha": trial.suggest_float("alpha",0,1),
            "scale_pos_weight": trial.suggest_float("scale_pos_weight",0,2)
        }
                
    model_instance = generate_model("HOMA-IR alterado", "data.csv", removed_features = ext_params["feature_engineering"]["removed_features"], xg_params = params, kfold_splits = 5, seed=seed)   
    
    return model_instance.get_AUC_on_test_data(), model_instance.get_feature_metrics()["SHAP_abnormal"]["fenilalax"]



In [162]:
import optuna
study_name: str = "alejandro_3"

study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///homa_studies.db",
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.TPESampler(), # NSGAIISampler(),
    load_if_exists=True,
)

study.optimize(objective, n_trials=10_000, n_jobs=-1)


[32m[I 2023-05-02 16:08:12,921][0m A new study created in RDB with name: alejandro_3[0m
[32m[I 2023-05-02 16:08:18,336][0m Trial 0 finished with values: [0.611111111111111, 0.0] and parameters: {'seed': 9115, 'max_depth': 3, 'eta': 0.12861081176109074, 'subsample': 0.7410904863899215, 'lambda': 0.7519983420965978, 'alpha': 0.6671775379771234, 'scale_pos_weight': 1.6683159420004525}. [0m
[32m[I 2023-05-02 16:08:18,346][0m Trial 7 finished with values: [0.75, 0.17103703320026398] and parameters: {'seed': 2928, 'max_depth': 5, 'eta': 0.295788422108401, 'subsample': 0.5276406107021067, 'lambda': 0.544383461280257, 'alpha': 0.6076372033806389, 'scale_pos_weight': 1.9661109100381071}. [0m
[32m[I 2023-05-02 16:08:18,796][0m Trial 12 finished with values: [0.6944444444444444, 0.0] and parameters: {'seed': 6413, 'max_depth': 5, 'eta': 0.26145730805478157, 'subsample': 0.6339498510740691, 'lambda': 0.9397283350604044, 'alpha': 0.841349165489574, 'scale_pos_weight': 0.5347311669162413}

In [None]:
import pandas as pd
from optuna.visualization._pareto_front import _get_pareto_front_info, _make_scatter_object, _make_marker, _make_hovertext
from typing import Sequence
from optuna.trial import FrozenTrial
from optuna.visualization._plotly_imports import go


info = _get_pareto_front_info(study)


n_targets: int = info.n_targets  
axis_order: Sequence[int]  = info.axis_order 
include_dominated_trials: bool = True
trials_with_values: Sequence[tuple[FrozenTrial, Sequence[float]]] = info.non_best_trials_with_values
hovertemplate: str = "%{text}<extra>Trial</extra>"
infeasible: bool = False
dominated_trials: bool = False

def trials_df(trials_with_values, class_name):
    x  =[values[axis_order[0]] for _, values in trials_with_values]
    y  =[values[axis_order[1]] for _, values in trials_with_values]

    df = pd.DataFrame({'x':x, 'y':y})
    df['class'] = class_name
    return df

df_best    = trials_df(info.best_trials_with_values, 'best')
df_nonbest = trials_df(info.non_best_trials_with_values, 'nonbest')
#df         = pd.concat([df_best, df_nonbest])

fig = go.Figure()

fig.add_trace(go.Scatter(x=df_nonbest ['x'], y=df_nonbest['y'], mode='markers', marker=dict(size=8,opacity=0.5, symbol='circle', line=dict(width=1, color='black')), name='Suboptimal'))
fig.add_trace(go.Scatter(x=df_best['x'], y=df_best['y'], mode='markers', marker=dict(size=8,opacity=0.5, symbol='square', line=dict(width=1, color='black')), name='Pareto frontier'))
# Define the width and height of the plot
plot_width = 800
plot_height = 400

# Customize the plot
fig.update_layout(
    #title='Two-objective optimization',
    
    xaxis_title='AUC',
    yaxis_title='Phe importance',
    font=dict(size=14),
    #plot_bgcolor='white',
    # xaxis=dict(range=[min(df['x']) - 1, max(df['x']) + 1]),
    # yaxis=dict(range=[min(df['y']) - 1, max(df['y']) + 1]),
    legend=dict(x=0.3, y=1.1, bgcolor='rgba(255, 255, 255, 0)', orientation='h'),
    width=plot_width,
    height=plot_height,
    margin=dict(l=0, r=0, t=0, b=0)
)

# Show the plot
fig.show()

In [None]:
# # Extract the best trials from the multi-objective study
# best_trials = study.best_trials
# 
# # Print information about the best trials
# for i, trial in enumerate(best_trials):
#     print(f"Best trial {i+1} number: {trial.number}")
#     print(f"Best trial {i+1} values: {trial.values}")
#     print(f"Best trial {i+1} parameters: {trial.params}")
#     print()


# Ranking plots

In [None]:
study.best_trials

[FrozenTrial(number=4377, state=TrialState.COMPLETE, values=[0.9615384615384616, 0.6442433595657349], datetime_start=datetime.datetime(2023, 5, 2, 15, 30, 10, 111358), datetime_complete=datetime.datetime(2023, 5, 2, 15, 30, 17, 250616), params={'seed': 5169, 'max_depth': 3, 'eta': 0.23535740338108185, 'subsample': 0.694765658809831, 'lambda': 0.2072832956537327, 'alpha': 0.05436494132607317, 'scale_pos_weight': 1.6986442009922793}, user_attrs={}, system_attrs={'nsga2:generation': 71}, intermediate_values={}, distributions={'seed': IntDistribution(high=10000, log=False, low=1, step=1), 'max_depth': IntDistribution(high=6, log=False, low=2, step=1), 'eta': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'subsample': FloatDistribution(high=0.9, log=False, low=0.5, step=None), 'lambda': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'alpha': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'scale_pos_weight': FloatDistribution(high=2.0, log=False, low=0

In [None]:
# Create a empty data frame for 
bs_trials = pd.DataFrame(columns=[
 'seed',
 'max_depth',
 'eta',
 'subsample',
 'lambda',
 'alpha',
 'scale_pos_weight',
 'auc', 'phe'
])


for t in study.best_trials:
    bs_trials.loc[t.number] = list(t.params.values()) + list(t.values)
    # and saves them to rows in a dataframe

bs_trials.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 4377 to 4962
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seed              20 non-null     float64
 1   max_depth         20 non-null     float64
 2   eta               20 non-null     float64
 3   subsample         20 non-null     float64
 4   lambda            20 non-null     float64
 5   alpha             20 non-null     float64
 6   scale_pos_weight  20 non-null     float64
 7   auc               20 non-null     float64
 8   phe               20 non-null     float64
dtypes: float64(9)
memory usage: 1.6 KB


In [None]:
bs_trials = bs_trials.drop_duplicates()
bs_trials.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8 entries, 4377 to 4850
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seed              8 non-null      float64
 1   max_depth         8 non-null      float64
 2   eta               8 non-null      float64
 3   subsample         8 non-null      float64
 4   lambda            8 non-null      float64
 5   alpha             8 non-null      float64
 6   scale_pos_weight  8 non-null      float64
 7   auc               8 non-null      float64
 8   phe               8 non-null      float64
dtypes: float64(9)
memory usage: 640.0 bytes


In [None]:
import plotly.express as px
#fig = px.parallel_coordinates(bs_trials)

fig = px.parallel_coordinates(
      bs_trials, 
      color="auc",
      dimensions=[
            'seed',
            'alpha',
            'eta',
            'lambda',
            'max_depth',
            'scale_pos_weight',
            'subsample',
            "phe", 
            "auc"
      ],)
fig.show()

In [None]:
import plotly.express as px

fig_boxplot = px.box(
    bs_trials.drop("seed", axis="columns"), 
    points="all", 
    facet_col="variable", 
)

fig_boxplot.update_xaxes(matches=None)
fig_boxplot.update_yaxes(matches=None)
fig_boxplot.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig_boxplot.show()

In [None]:
bs_trials['seed'], bs_trials['max_depth'] = bs_trials['seed'].astype('int'), bs_trials['max_depth'].astype('int')
bs_trials.head()


Unnamed: 0,seed,max_depth,eta,subsample,lambda,alpha,scale_pos_weight,auc,phe
4377,5169,3,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4421,5169,3,0.253002,0.759269,0.207283,0.076526,1.696173,1.0,0.609401
4436,5169,6,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4460,5169,2,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4479,5169,5,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243


In [None]:
params ={"objective":   "binary:logistic",
            "eval_metric": "logloss"}


params.update(
    bs_trials.drop(["auc","phe"], axis="columns").iloc[1].to_dict()
)

params

{'objective': 'binary:logistic',
 'eval_metric': 'logloss',
 'seed': 5169.0,
 'max_depth': 3.0,
 'eta': 0.25300181253009474,
 'subsample': 0.7592694807222635,
 'lambda': 0.2072832956537327,
 'alpha': 0.07652617012564833,
 'scale_pos_weight': 1.696173408025679}

In [None]:
bs_trials

Unnamed: 0,seed,max_depth,eta,subsample,lambda,alpha,scale_pos_weight,auc,phe
4377,5169,3,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4421,5169,3,0.253002,0.759269,0.207283,0.076526,1.696173,1.0,0.609401
4436,5169,6,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4460,5169,2,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4479,5169,5,0.235357,0.694766,0.207283,0.054365,1.698644,0.961538,0.644243
4730,5169,3,0.211323,0.694766,0.207283,0.054365,1.698644,0.923077,0.767981
4794,5169,2,0.211323,0.694766,0.207283,0.054365,1.698644,0.923077,0.767981
4850,5169,4,0.211323,0.694766,0.207283,0.054365,1.698644,0.923077,0.767981


In [None]:

paretos = {}


for number in bs_trials.index: 


    instance_params = bs_trials.loc[number][['max_depth','eta','subsample','lambda','alpha','scale_pos_weight']].to_dict()
    instance_params['max_depth'] = int(instance_params['max_depth'])

    instance_params['objective'] = 'binary:logistic'
    instance_params['eval_metric'] = 'logloss'

    print(instance_params)

    model_instance = generate_model(
        "HOMA-IR alterado", "data.csv", 
        removed_features = ext_params["feature_engineering"]["removed_features"], 
        xg_params = instance_params,
        kfold_splits = 5, 
        seed = int(bs_trials.loc[number]['seed'])
    )   
# 
    print(
        model_instance.model.attributes()
    )
    
    print(
        'Phe_value : ', model_instance.get_feature_metrics()["SHAP_abnormal"]["fenilalax"], "\t",
        'Phe_ranking : ',model_instance.get_feature_metrics()["SHAP_abnormal"].rank(ascending=False)["fenilalax"]
    )

    print('\n')





{'max_depth': 3, 'eta': 0.23535740338108185, 'subsample': 0.694765658809831, 'lambda': 0.2072832956537327, 'alpha': 0.05436494132607317, 'scale_pos_weight': 1.6986442009922793, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
{'best_iteration': '13', 'best_ntree_limit': '14', 'best_score': '0.3996595727900664'}
Phe_value :  0.64424336 	 Phe_ranking :  1.0


{'max_depth': 3, 'eta': 0.25300181253009474, 'subsample': 0.7592694807222635, 'lambda': 0.2072832956537327, 'alpha': 0.07652617012564833, 'scale_pos_weight': 1.696173408025679, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
{'best_iteration': '7', 'best_ntree_limit': '8', 'best_score': '0.2976595498621464'}
Phe_value :  0.60940063 	 Phe_ranking :  1.0


{'max_depth': 6, 'eta': 0.23535740338108185, 'subsample': 0.694765658809831, 'lambda': 0.2072832956537327, 'alpha': 0.05436494132607317, 'scale_pos_weight': 1.6986442009922793, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
{'best_iteration': '13', 'bes

In [None]:
pd.DataFrame(paretos).T