In [1]:
import numpy as np
import pandas as pd
import xgboost
import yaml
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import shap
from sklearn.model_selection import StratifiedKFold


class generate_model:
    def __init__(
        self,
        predictor_col: str,
        data_path: str,
        xg_params: dict,
        kfold_splits: int,
        test_size=0.3,
        removed_features: list[str] = None,
        seed: float = None,
    ) -> None:
        if removed_features is None:
            removed_features = []
        dataset = pd.read_csv(data_path)

        # Variable encoding
        dataset["Género"] = (
            dataset["Género"].replace({"M": 0, "F": 1}).astype("category")
        )
        dataset["ATPII/AHA/IDF"] = (
            dataset["ATPII/AHA/IDF"].replace({"no": 0, "si": 1}).astype("category")
        )
        dataset["aleator"] = (
            dataset["aleator"]
            .replace({"Control": 0, "PKU 1": 1, "PKU 2": 2})
            .astype("category")
        )

        y_df = dataset[predictor_col].replace({"No": 0, "Si": 1}).astype("category")
        X_df = dataset.drop(predictor_col, axis="columns")

        X_df = X_df.drop(removed_features, axis="columns")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X_df, y_df, stratify=y_df, test_size=test_size, random_state=seed
        )
        # def xg_train(self, xg_params: dict, kfold_splits=5, seed=None):

        cv = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=seed)
        folds = list(cv.split(self.X_train, self.y_train))

        for train_idx, val_idx in folds:
            # Sub-empaquetado del train-set en formato de XGBoost
            dtrain = xgboost.DMatrix(
                self.X_train.iloc[train_idx, :],
                label=self.y_train.iloc[train_idx],
                enable_categorical=True,
            )
            dval = xgboost.DMatrix(
                self.X_train.iloc[val_idx, :],
                label=self.y_train.iloc[val_idx],
                enable_categorical=True,
            )

            self.model = xgboost.train(
                dtrain=dtrain,
                params=xg_params,
                evals=[(dtrain, "train"), (dval, "val")],
                num_boost_round=1000,
                verbose_eval=False,
                early_stopping_rounds=10,
            )

    def get_AUC_on_test_data(self):
        # def xg_test(model, X_test, y_test) -> float:
        testset = xgboost.DMatrix(
            self.X_test, label=self.y_test, enable_categorical=True
        )
        y_preds = self.model.predict(testset)

        return roc_auc_score(testset.get_label(), y_preds)

    def get_feature_metrics(self):
        # internal_feature_metrics = pd.DataFrame(
        #     {
        #         "Weight": self.model.get_score(importance_type="weight"),
        #         "Coverage": self.model.get_score(importance_type="cover"),
        #         "Gain": self.model.get_score(importance_type="gain"),
        #     }
        # ).sort_values(by="Gain", ascending=False)

        explainer = shap.TreeExplainer(self.model)

        # Extrae la explicacion SHAP en un DF
        EXPLAINATION = explainer(self.X_test).cohorts(
            self.y_test.replace({0: "Healty", 1: "Abnormal"}).tolist()
        )
        cohort_exps = list(EXPLAINATION.cohorts.values())

        exp_shap_abnormal = pd.DataFrame(
            cohort_exps[0].values, columns=cohort_exps[0].feature_names
        )  # .abs().mean()

        exp_shap_healty = pd.DataFrame(
            cohort_exps[1].values, columns=cohort_exps[1].feature_names
        )  # .abs().mean()

        feature_metrics = pd.concat(
            {
                "SHAP_healty": exp_shap_healty.abs().mean(),
                "SHAP_abnormal": exp_shap_abnormal.abs().mean(),
            },
            axis="columns",
        )

        # feature_metrics = (
        #     feature_metrics.join(internal_feature_metrics)
        #     .fillna(0)
        #     .sort_values(by="Gain", ascending=False)
        # )

        return feature_metrics


with open("params.yml", "r") as f:
    ext_params = yaml.load(f, Loader=yaml.FullLoader)


def objective(trial, finetunning: bool = False) -> float:
    """
    The function that runs a single model and evaluates it.
    """

#    seed = 42 #trial.suggest_int("seed", 1, 10_000)

    # params={
    #         "objective":   "binary:logistic",
    #         "eval_metric": "logloss",
    #         'max_depth':   trial.suggest_int("max_depth", 2, 6, ),
    #         "eta":         trial.suggest_float("eta", 0.01, 0.3),
    #         "subsample":   trial.suggest_float("subsample", 0.5, 0.9),
    #         "lambda": trial.suggest_float("lambda", 0, 1),
    #         "alpha": trial.suggest_float("alpha",0,1),
    #         "scale_pos_weight": trial.suggest_float("scale_pos_weight",0,2)
    #     }


    seed = trial.suggest_int("seed", 1, 10_000)


    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
'max_depth' :          3,
'eta' :                0.272876,
'subsample':           0.694101,
'lambda' :             0.169166,
'alpha':               0.115721,
'scale_pos_weight':    0.320643,

    }

    model_instance = generate_model(
        "HOMA-IR alterado",
        "_best_artifact/optuna/resampled_data_BorderlineSMOTE.csv",
        removed_features=ext_params["feature_engineering"]["removed_features"],
        xg_params=params,
        kfold_splits=5,
        seed=seed,
    )

    return (
        model_instance.get_AUC_on_test_data(),
        model_instance.get_feature_metrics()["SHAP_abnormal"]["fenilalax"],
    )

In [40]:
import optuna

study_name: str = "alejandro_resampled_data_BorderlineSMOTE"

study = optuna.create_study(
    #study_name=study_name,
    #storage="sqlite:///alejandro.db",
    directions=["maximize", "maximize"],
    # sampler=optuna.samplers.TPESampler(),  # NSGAIISampler(),
    sampler=optuna.samplers.NSGAIISampler(), # NSGAIISampler(),
    load_if_exists=True,
)

study.optimize(objective, n_trials=1_000, n_jobs=-1)
# Sampling may take a long time. Other samples are faster but tend to be hiperfixed and thus generate repeated results.

[32m[I 2023-05-03 14:00:09,681][0m A new study created in memory with name: no-name-1b25dcc6-a989-42d7-b41d-54271b006465[0m
[32m[I 2023-05-03 14:00:11,828][0m Trial 1 finished with values: [0.8333333333333334, 0.0] and parameters: {'seed': 5077}. [0m
[32m[I 2023-05-03 14:00:11,895][0m Trial 15 finished with values: [1.0, 0.06753042340278625] and parameters: {'seed': 6151}. [0m
[32m[I 2023-05-03 14:00:11,905][0m Trial 2 finished with values: [0.9166666666666666, 0.2080060839653015] and parameters: {'seed': 840}. [0m
[32m[I 2023-05-03 14:00:11,907][0m Trial 9 finished with values: [0.9722222222222223, 0.0] and parameters: {'seed': 6636}. [0m
[32m[I 2023-05-03 14:00:11,958][0m Trial 5 finished with values: [0.8854166666666667, 0.0] and parameters: {'seed': 4103}. [0m
[32m[I 2023-05-03 14:00:11,958][0m Trial 19 finished with values: [0.8298611111111112, 0.0] and parameters: {'seed': 9331}. [0m
[32m[I 2023-05-03 14:00:12,015][0m Trial 14 finished with values: [0.95833

In [41]:
import pandas as pd
from optuna.visualization._pareto_front import (
    _get_pareto_front_info,
    _make_scatter_object,
    _make_marker,
    _make_hovertext,
)
from typing import Sequence
from optuna.trial import FrozenTrial
from optuna.visualization._plotly_imports import go


info = _get_pareto_front_info(study)

n_targets: int = info.n_targets
axis_order: Sequence[int] = info.axis_order
include_dominated_trials: bool = True
trials_with_values: Sequence[
    tuple[FrozenTrial, Sequence[float]]
] = info.non_best_trials_with_values
hovertemplate: str = "%{text}<extra>Trial</extra>"
infeasible: bool = False
dominated_trials: bool = False


def trials_df(trials_with_values, class_name):
    x = [values[axis_order[0]] for _, values in trials_with_values]
    y = [values[axis_order[1]] for _, values in trials_with_values]

    df = pd.DataFrame({"x": x, "y": y})
    df["class"] = class_name
    return df


df_best = trials_df(info.best_trials_with_values, "best")
df_nonbest = trials_df(info.non_best_trials_with_values, "nonbest")
# df         = pd.concat([df_best, df_nonbest])

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_nonbest["x"],
        y=df_nonbest["y"],
        mode="markers",
        marker=dict(
            size=8, opacity=0.5, symbol="circle", line=dict(width=1, color="black")
        ),
        name="Suboptimal",
    )
)
fig.add_trace(
    go.Scatter(
        x=df_best["x"],
        y=df_best["y"],
        mode="markers",
        marker=dict(
            size=8, opacity=0.5, symbol="square", line=dict(width=1, color="black")
        ),
        name="Pareto frontier",
    )
)
# Define the width and height of the plot
plot_width = 900
plot_height = 400
axis_label_font_size = 19

# Customize the plot
fig.update_layout(
    xaxis=dict(
        title=dict(
            text="Model performance (AUC)", font=dict(size=axis_label_font_size)
        ),
        range=[-0.05, 1.05],
    ),
    yaxis=dict(
        title=dict(
            text="Phe importance (Shapley value)", font=dict(size=axis_label_font_size)
        ),
        range=[-0.05, 0.85],
    ),
    font=dict(size=18),
    # plot_bgcolor='white',
    legend=dict(x=0.3, y=1.1, bgcolor="rgba(255, 255, 255, 0)", orientation="h"),
    width=plot_width,
    height=plot_height,
    margin=dict(l=0, r=0, t=0, b=0),
)
x_max = max(df_nonbest["x"].max(), df_best["x"].max())

fig.update_layout(
    # ... (previous settings)
    shapes=[
        dict(
            type="rect",
            xref="x",
            yref="paper",
            x0=0.8,
            x1=x_max,
            y0=0.0,
            y1=1,
            fillcolor="darkgreen",
            opacity=0.20,
            layer="below",
            line_width=0,
        )
    ],
    # ... (previous settings)
)


import plotly.io as pio

dpi = 300  # Set the desired resolution (dots per inch)
output_filename = f"pareto_{study_name}.png"
pio.write_image(fig, output_filename, format="png", scale=dpi / 96)

# Show the plot
fig.show()

In [42]:
# Extract the best trials from the multi-objective study
best_trials = study.best_trials

# Print information about the best trials
for i, trial in enumerate(best_trials):
    print(f"Best trial {i+1} number: {trial.number}")
    print(f"Best trial {i+1} values: {trial.values}")
    print(f"Best trial {i+1} parameters: {trial.params}")
    print()

Best trial 1 number: 43
Best trial 1 values: [0.9027777777777778, 0.6335051655769348]
Best trial 1 parameters: {'seed': 8998}

Best trial 2 number: 370
Best trial 2 values: [0.986111111111111, 0.5854418873786926]
Best trial 2 parameters: {'seed': 316}

Best trial 3 number: 431
Best trial 3 values: [1.0, 0.465649276971817]
Best trial 3 parameters: {'seed': 5045}

Best trial 4 number: 435
Best trial 4 values: [0.9791666666666666, 0.6262168288230896]
Best trial 4 parameters: {'seed': 5870}



# Ranking plots

In [55]:
# Create a empty data frame for


bs_trials = pd.DataFrame(
    columns=list(study.best_trials[0].params.keys())+['auc', 'phe'])

# hardcoded_params = {
hardcoded_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
'max_depth' :          3,
'eta' :                0.272876,
'subsample':           0.694101,
'lambda' :             0.169166,
'alpha':               0.115721,
'scale_pos_weight':    0.320643,

    }

CUTOFF_AUC = 0.80
CUTOFF_PHE = 0.20

for t in study.get_trials():#[:99]:  # [:499]:
    if t.values[0] > CUTOFF_AUC and t.values[1] > CUTOFF_PHE:
        #bs_trials.loc[t.number] = list(t.params.values()) + list(t.values)
        bs_trials.loc[t.number] = (
            [3, 0.272876, 0.694101, 0.169166, 0.169166,0.115721, 0.320643]
            + list(t.params.values())
            + [0.749761]
            + list(t.values)
        )

# for t in study.best_trials:
#     bs_trials.loc[t.number] = list(t.params.values()) + list(t.values)
    # and saves them to rows in a dataframe

bs_trials.info()

ValueError: cannot set a row with mismatched columns

In [44]:
bs_trials = bs_trials.drop_duplicates()
bs_trials.describe().loc["50%"]

seed    5457.500000
auc        0.982639
phe        0.605829
Name: 50%, dtype: float64

In [46]:
import plotly.express as px

# fig = px.parallel_coordinates(bs_trials)

fig = px.parallel_coordinates(
    bs_trials,
    color="auc",
    # dimensions=[
    #     "seed",
    #     "alpha",
    #     "eta",
    #     "lambda",
    #     "max_depth",
    #     "scale_pos_weight",
    #     "subsample",
    #     "phe",
    #     "auc",
    # ],
)
fig.show()

In [47]:
import plotly.express as px

fig_boxplot = px.box(
    bs_trials,#.drop("seed", axis="columns"),
    points="all",
    facet_col="variable",
)

fig_boxplot.update_xaxes(matches=None)
fig_boxplot.update_yaxes(matches=None)
fig_boxplot.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
fig_boxplot.show()

In [48]:
bs_trials["seed"], bs_trials["max_depth"] = bs_trials["seed"].astype("int"), bs_trials[
    "max_depth"
].astype("int")
bs_trials.head()

KeyError: 'max_depth'

In [49]:
params = {"objective": "binary:logistic", "eval_metric": "logloss"}


params.update(bs_trials.drop(["auc", "phe"], axis="columns").iloc[1].to_dict())

params

{'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 316.0}

In [50]:
bs_trials

Unnamed: 0,seed,auc,phe
43,8998.0,0.902778,0.633505
370,316.0,0.986111,0.585442
431,5045.0,1.0,0.465649
435,5870.0,0.979167,0.626217


In [51]:
paretos = {}

df_ranks = pd.DataFrame()

for number in bs_trials.index:
    instance_params = bs_trials.loc[number][
        ["max_depth", "eta", "subsample", "lambda", "alpha", "scale_pos_weight"]
    ].to_dict()
    instance_params["max_depth"] = int(instance_params["max_depth"])

    instance_params["objective"] = "binary:logistic"
    instance_params["eval_metric"] = "logloss"

    print(instance_params)

    model_instance = generate_model(
        "HOMA-IR alterado",
        "data.csv",
        removed_features=ext_params["feature_engineering"]["removed_features"],
        xg_params=instance_params,
        kfold_splits=5,
        seed=int(bs_trials.loc[number]["seed"]),
    )
    #
    print(model_instance.model.attributes())

    df_ranks[number] = model_instance.get_feature_metrics()["SHAP_abnormal"]

    print(
        "Phe_value : ",
        model_instance.get_feature_metrics()["SHAP_abnormal"]["fenilalax"],
        "\t",
        "Phe_ranking : ",
        model_instance.get_feature_metrics()["SHAP_abnormal"].rank(ascending=False)[
            "fenilalax"
        ],
    )

    print("\n")

KeyError: "None of [Index(['max_depth', 'eta', 'subsample', 'lambda', 'alpha', 'scale_pos_weight'], dtype='object')] are in the [index]"

In [79]:
# RELABELING
labels_relabel = {
    "Género": "gender",
    "aleator": "random",
    "Edad": "Age",
    "Peso": "Weight",
    "Estatura": "Height",
    "IMC": "BMI",
    "Circunferencia de cintura": "Waist circumference",
    "ATPII/AHA/IDF": "ATPII/AHA/IDF",
    "fenilalax": "Phenylalax",
    "glupromx": "Glupromx",
    "glummol": "Glummol",
    "insuprom": "Insuprom",
    "HOMA-IR alterado": "HOMA-IR altered",
    "HOMA2-IR": "HOMA2-IR",
    "HOMA2B(%)": "HOMA2B(%)",
    "HOMA2S%": "HOMA2S%",
    "quickix": "Quickix",
    "ohd3x": "ohd3x",
    "tirosinax": "Tyrosinax",
    "Alanina": "Alanine",
    "Aspartato": "Aspartate",
    "Glutamato": "Glutamate",
    "Leucina": "Leucine",
    "Ornitina": "Ornithine",
    "Prolina": "Proline",
    "Tirosina": "Tyrosine",
    "Carnitina libre": "Free Carnitine",
    "Propionilcarnitina": "Propionylcarnitine",
    "Isovalerilcarnitina": "Isovalerylcarnitine",
    "Tiglilcarnitina": "Tiglilcarnitine",
    "Me-Glutarilcarnitina": "Me-Glutarylcarnitine",
    "Decanoilcarnitina": "Decanoylcarnitine",
    "Tetradecanoilcarnitina": "Tetradecanoylcarnitine",
    "3-OH-Isovalerilcarnitina": "3-OH-Isovalerylcarnitine",
    "3-OH-Palmitoilcarnitina": "3-OH-Palmitoylcarnitine",
    "Linoleoilcarnitina": "Linoleoilcarnitine",
    "Arginina": "Arginine",
    "Citrulina": "Citrulline",
    "Glicina": "Glycine",
    "Metionina": "Methionine",
    "Fenilalanina": "Phenylalanine",
    "Succinilacetona": "Succinylacetone",
    "Valina": "Valine",
    "Acetilcarnitina": "Acetylcarnitine",
    "Butirilcarnitina": "Butyrylcarnitine",
    "Glutarilcarnitina": "Glutarylcarnitine",
    "Hexanoilcarnitina": "Hexanoylcarnitine",
    "Octanoilcarnitina": "Octanoylcarnitine",
    "Dodecanoilcarnitina": "Dodecanoylcarnitine",
    "Tetradecenoilcarnitina": "Tetradecenoylcarnitine",
    "Palmitoilcarnitina": "Palmitoylcarnitine",
    "Estearoilcarnitina": "Stearoylcarnitine",
    "3-OH-Linoleoilcarnitina": "3-OH-Linoleoylcarnitine",
    "PROTEINAProm_(G)": "Protein avg. (g)",
    "Proteina_natural": "Protein natural",
    "%_proteina_natural": "% Natural Protein",
    "Proteina_SP": "SP Protein",
    "SP_gr/kg": "SP (gr/kg)",
    "%_Proteina_SP": "% SP Protein",
    "GRASAProm(G)": "Fat avg. (g)",
    "CARBOHIDRATOProm_(G)": "CARBOHYDRATE avg. (g)",
    "ENERGIAProm_(KCAL)": "ENERGY avg. (KCAL)",
    "COLESTEROLProm_(MG)": "CHOLESTEROL avg. (mg)",
    "FENILALANINAProm_(G)": "PHENYLALANINE_avg (g)",
    "TIROSINAProm_(G)": "TYROSINE avg. (g)",
    "VITAMINA_B12Prom_(MCG)": "Vitamin B12 avg. (MCG)",
    "FOLATOProm_(MCG)": "Folate avg. (MCG)",
    "CALCIOProm_(MG)": "Calcium avg. (mg)",
    "COBREProm_(MG)": "Copper avg. (mg)",
    "HIERROProm_(MG)": "Iron avg. (mg)",
    "ZINCProm_(MG)": "Zinc avg. (mg)",
    "VITAMINA_Dprom": "Vitamin D avg.",
}

In [80]:
df_ranks_long = df_ranks.T
order = (
    df_ranks_long.rank(axis="columns", ascending=False)
    .median(axis="rows")
    .sort_values()
    .index
)

# Create a grouped barplot using Plotly
fig_barplot = px.bar(
    df_ranks,
    barmode="group",
    template="ggplot2",
)
fig_barplot.update_xaxes(
    categoryorder="array", categoryarray=order, labelalias=labels_relabel
)


fig_barplot.update_layout(
    xaxis_title="Feature in the dataset",
    yaxis_title="Shapley Value importance",
    title="Shapley values for abnormal samples",
    legend_title="Model",
)

# Show the plot
fig_barplot.show()

# fig = px.bar(, x='variable', y='value')
# fig.show()

In [81]:
df_ranks_long["auc_score"] = bs_trials["auc"]

In [82]:
fig = px.parallel_coordinates(
    df_ranks_long,
    # color='fenilalax',
    color="auc_score",
    dimensions=order.tolist()[:10] + ["auc_score"],
    labels=labels_relabel,
    title="Shapley values for abnormal samples - ADASYNT",
)

fig.update_layout(
    coloraxis_showscale=False,
    # title="Feature importance",
    xaxis_title="Feature in the dataset",
    yaxis_title="Shapley Value importance",
    # legend_title="Shapley values for abnormal samples",
)

fig.show()