This notebook analyses the ensemble of models obtained from multiple runs managed by Optuna. 
We first read the Optuna SQLite storage file, which holds the run information for several thousand (~5,000) models. 

In [4]:
import optuna

# Name of the Optuna study to analyse.
# Alternative names left by the author: "TRIALS_5", "SIN_IMC_CircCint".
study_name: str = "oversample_ADASYNT_3"

# load_if_exists=True makes this call reuse the study already stored under
# `study_name` in the SQLite file instead of creating a new one. Both
# objectives of the study are maximised.
study = optuna.create_study(
    storage="sqlite:///homa_studies.db",
    study_name=study_name,
    load_if_exists=True,
    directions=["maximize", "maximize"],
    sampler=optuna.samplers.TPESampler(),
)

[32m[I 2023-04-27 12:05:21,557][0m Using an existing study with name 'oversample_ADASYNT_3' instead of creating a new one.[0m


In [5]:
import pandas as pd

# Build one column per trial and assemble the frame in a single step.
# Inserting thousands of columns one at a time (`df[name] = series`) fragments
# the DataFrame and makes pandas emit a PerformanceWarning.
trial_columns: dict[str, pd.Series] = {}

# Swap `study.trials` for `study.best_trials` to look only at the best trials.
for n, trial in enumerate(study.trials):
    # Each column stacks three groups under a two-level (group, name) index:
    # the two SHAP dictionaries stored as user attributes and the first
    # objective value, labelled "AUC".
    trial_columns[f"Trial_{n}"] = pd.concat(
        [
            pd.Series(trial.user_attrs["shap_abnormal"]),
            pd.Series(trial.user_attrs["shap_healty"]),
            pd.Series(trial.values[0], index=["AUC"]),  # ,"PheScore"])
        ],
        keys=["SHAP_Abnormal", "SHAP_Healty", "StudyValues"],
    )

if trial_columns:
    # Same row layout as column-by-column insertion produced: the first
    # trial's index defines the rows and every other trial is aligned to it.
    first_index = next(iter(trial_columns.values())).index
    df = pd.DataFrame(trial_columns, index=first_index)
else:
    # No trials in the study: keep the empty frame the original code produced.
    df = pd.DataFrame()


In [6]:
# Seed parameter and both objective values of every trial.
lst_seeds = [t.params["seed"] for t in study.trials]
lst_values = [{"AUC": t.values[0], "Phe": t.values[1]} for t in study.trials]

# One row per trial: objective values plus the seed that produced them.
df3 = pd.DataFrame(lst_values).assign(seeds=pd.Series(lst_seeds))

# Ease of use: after the transpose each row is a trial and the columns carry
# the (group, name) labels.
# NOTE(review): not idempotent - re-running this cell transposes `df` back.
df = df.transpose()

# THIS IS THE LIMIT WE USE
lim_AUC: float = 0.90

# Rows at or below the cutoff are blanked to NaN by `where` and then removed
# by `dropna`, which also drops any remaining row containing a NaN.
df2 = df.where(df[("StudyValues", "AUC")] > lim_AUC).dropna()

We package this info into DataFrames, for ease of use. 

In [7]:
# This exists for the plots below: it maps each raw column name to the label
# shown on the figures. Most columns keep their own name, so the mapping is
# built as an identity map (in the original column order) and the few columns
# that get a different display label are overridden afterwards.
names_rename = {
    column: column
    for column in (
        'gender',
        'BMI',
        'Waist circumference',
        'ATPII/AHA/IDF',
        'Phenylalax',
        'Glupromx',
        'Insuprom',
        'Tyrosinax',
        'Alanine',
        'Aspartate',
        'Glutamate',
        'Leucine',
        'Ornithine',
        'Proline',
        'Tyrosine',
        'Free Carnitine',
        'Propionylcarnitine',
        'Isovalerylcarnitine',
        'Tiglilcarnitine',
        'Me-Glutarylcarnitine',
        'Decanoylcarnitine',
        'Tetradecanoylcarnitine',
        '3-OH-Isovalerylcarnitine',
        '3-OH-Palmitoylcarnitine',
        'Linoleoilcarnitine',
        'Arginine',
        'Citrulline',
        'Glycine',
        'Methionine',
        'Phenylalanine',
        'Succinylacetone',
        'Valine',
        'Acetylcarnitine',
        'Butyrylcarnitine',
        'Glutarylcarnitine',
        'Hexanoylcarnitine',
        'Octanoylcarnitine',
        'Dodecanoylcarnitine',
        'Tetradecenoylcarnitine',
        'Palmitoylcarnitine',
        'Stearoylcarnitine',
        '3-OH-Linoleoylcarnitine',
        'PROTEIN_Avg_(G)',
        'Protein_natural',
        'SP_Protein',
        'FAT_avg(G)',
    )
}

# Columns whose display label differs from the raw column name. Updating
# existing keys keeps the insertion order established above.
names_rename.update(
    {
        'gender': 'Sex',
        'Waist circumference': 'Waist Circumference',
        'Phenylalax': 'PheC',
        'Glupromx': 'Glucose',
        'Insuprom': 'Insulin',
        'Tyrosinax': 'Tyrosine',
        'PROTEIN_Avg_(G)': 'Protein consumption',
        'Protein_natural': 'Natural protein',
        'SP_Protein': 'Supplemented protein',
        'FAT_avg(G)': 'Fat consumption average',
    }
)

In [8]:
# Optuna's stock Pareto-front scatter, with the two objectives given
# readable axis names.
fig = optuna.visualization.plot_pareto_front(
    study,
    target_names=["AUC", "Phenylalanine Contribution"],
)

# Shade the vertical band from the AUC cutoff up to 1.0.
fig.add_vrect(
    x0=lim_AUC,
    x1=1.0,
    fillcolor="teal",
    opacity=0.2,
    line_width=1,
)

fig.update_layout(
    template='plotly_white',
    title_text="Pareto-front Plot",
    yaxis_tickformat=".2f",
)

# Base size for figure export; the (disabled) write_image calls in this
# notebook express width and height as multiples of it.
DPI = 320

# fig.write_image("fig/pareto_plots.svg", width=3*DPI, height=2*DPI)

fig.show()

In [9]:
# BETTER PARETO
import pandas as pd
from optuna.visualization._pareto_front import _get_pareto_front_info, _make_scatter_object, _make_marker, _make_hovertext
from typing import Sequence
from optuna.trial import FrozenTrial
from optuna.visualization._plotly_imports import go


# Pareto-front summary of the study, obtained from a private Optuna helper
# (underscore-prefixed module), so it may change between Optuna versions.
info = _get_pareto_front_info(study)

# Values read from that summary.
axis_order: Sequence[int] = info.axis_order
n_targets: int = info.n_targets
trials_with_values: Sequence[tuple[FrozenTrial, Sequence[float]]] = (
    info.non_best_trials_with_values
)

# Plain settings.
# NOTE(review): apart from `axis_order`, none of the names in this block is
# read by the visible cells - confirm before relying on them.
hovertemplate: str = "%{text}<extra>Trial</extra>"
include_dominated_trials: bool = True
dominated_trials: bool = False
infeasible: bool = False

def trials_df(trials_with_values, class_name, axes=None):
    """Tabulate two objective values per trial for plotting.

    Args:
        trials_with_values: Iterable of ``(trial, values)`` pairs. Only
            ``values`` is read; it is consumed in a single pass, so a
            one-shot iterator works too.
        class_name: Label written to the ``class`` column of every row.
        axes: Optional sequence whose first two items are the positions in
            ``values`` used for ``x`` and ``y``. When omitted, the
            module-level ``axis_order`` is used, as before.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``class``, one row per
        input pair.
    """
    if axes is None:
        # Backward-compatible default: fall back to the notebook-level global.
        axes = axis_order
    x_pos, y_pos = axes[0], axes[1]

    xs, ys = [], []
    for _, values in trials_with_values:
        xs.append(values[x_pos])
        ys.append(values[y_pos])

    table = pd.DataFrame({'x': xs, 'y': ys})
    table['class'] = class_name
    return table

# Tabulate the Pareto-optimal ("best") trials and the remaining ones separately.
df_best = trials_df(info.best_trials_with_values, 'best')
df_nonbest = trials_df(info.non_best_trials_with_values, 'nonbest')
#df         = pd.concat([df_best, df_nonbest])

# Define the width and height of the plot
plot_width = 800
plot_height = 400

fig = go.Figure()

# The non-best trials are added first, the Pareto-front trials second.
fig.add_trace(
    go.Scatter(
        x=df_nonbest['x'],
        y=df_nonbest['y'],
        mode='markers',
        name='Suboptimal',
        marker={
            'size': 8,
            'opacity': 0.5,
            'symbol': 'circle',
            'line': {'width': 1, 'color': 'black'},
        },
    )
)
fig.add_trace(
    go.Scatter(
        x=df_best['x'],
        y=df_best['y'],
        mode='markers',
        name='Pareto frontier',
        marker={
            'size': 8,
            'opacity': 0.5,
            'symbol': 'square',
            'line': {'width': 1, 'color': 'black'},
        },
    )
)

# Customize the plot
# Options left disabled by the author:
#   title='Two-objective optimization'
#   plot_bgcolor='white'
#   xaxis=dict(range=[min(df['x']) - 1, max(df['x']) + 1])
#   yaxis=dict(range=[min(df['y']) - 1, max(df['y']) + 1])
fig.update_layout(
    xaxis_title='AUC',
    yaxis_title='Phe importance',
    font={'size': 14},
    legend={
        'x': 0.3,
        'y': 1.1,
        'bgcolor': 'rgba(255, 255, 255, 0)',
        'orientation': 'h',
    },
    width=plot_width,
    height=plot_height,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
)

# Show the plot
fig.show()

In [10]:
# TODO: Re-do this in a less chaotic graph

- Y ~ _Mean SHAP (Shapley values taken from the combinatorial approach to modelling feature importance in a system, based on the linear sum of ... ) value assigned to each feature by the model_  
- X ~ _AUC_, a measure of how well the model fits, representing ... 

So, we take these models with AUC > 0.9. 

In [11]:
import plotly.express as px

# Just because we liked this palette
custom_colorscale = ["rgb(30, 144, 255)", "rgb(148, 0, 211)", "rgb(255, 20, 147)"]

# Mean of every "SHAP_Abnormal" column over the models kept in df2, largest
# first. The 'ATPII/AHA/IDF' column is left out of the ranking.
SORTED_MEANS = (
    df2["SHAP_Abnormal"]
    .drop(["ATPII/AHA/IDF"], axis='columns')
    .mean()
    .sort_values(ascending=False)
)

# Display label for each ranked feature. A feature with no entry in
# names_rename keeps its raw name instead of aborting the cell with KeyError.
relabels = {idx: names_rename.get(idx, idx) for idx in SORTED_MEANS.index}

# Parallel-coordinates view of the 13 highest-ranked features, coloured by the
# 'Phenylalax' column. The ranking computed above is reused rather than
# recomputed inside the call.
fig_abnormal = px.parallel_coordinates(
    df2["SHAP_Abnormal"],
    dimensions=SORTED_MEANS.index[:13],
    color="Phenylalax",
    labels=relabels,
    color_continuous_scale=custom_colorscale,
    title=f"Model contributions -  'HOMA Abnormal' - {lim_AUC} < AUC ({df2.shape[0]} models)",
)

fig_abnormal.update_layout(
    yaxis_tickformat=".2f",
    coloraxis_colorbar=dict(title="Phe. weight"),
)

# Save as SVG, we can save as PNG later
# fig_abnormal.write_image("fig/abnormal_ranks.svg", width=4.5*DPI, height=1.5*DPI, scale = 1.0)

fig_abnormal.show()

The main observation is that the BMI (IMC) variable takes a high weight in most models, and that the weight given to PheC is roughly bimodal.
Some models do not use it at all, while in others it accounts for ~20% of the final weight. 

In [12]:
# Same parallel-coordinates view as for the abnormal group, built from the
# "SHAP_Healty" columns: the 13 features with the largest mean (excluding
# 'ATPII/AHA/IDF'), coloured by the 'Phenylalax' column.
fig_healty = px.parallel_coordinates(
    df2["SHAP_Healty"],
    dimensions=(
        df2["SHAP_Healty"]
        .drop(columns=["ATPII/AHA/IDF"])
        .mean()
        .sort_values(ascending=False)
        .index[:13]
    ),
    color="Phenylalax",
    labels=relabels,
    color_continuous_scale=custom_colorscale,
    title=f"Model contributions -  'HOMA Healty' - {lim_AUC} < AUC ({df2.shape[0]} models)",
)

# fig_healty.write_image("fig/healty_ranks.svg", width=4.5*DPI, height=1.5*DPI, scale = 1.0)

fig_healty.show()

Y ~ _Mean SHAP (Shapley values taken from the combinatorial approach to modelling feature importance in a system, based on the linear sum of ... ) value assigned to each feature by the model_  
X ~ _Ranking of the features by mean SHAP value (highest first)_  

For "healthy" samples, the same variables remain in the first places, although the weight assigned to BMI varies greatly. 

![](fig/panel_2302.svg)