In [1]:
import warnings
warnings.filterwarnings("ignore")
from utils import *
import pickle


def get_studies(a_study):
    """Load and return the object stored in ``<a_study>.pickle``.

    Parameters
    ----------
    a_study : str
        Path or file name without the ``.pickle`` extension; the extension is
        appended before the file is opened in binary mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    pickle_path = f'{a_study}.pickle'
    # Unpickling can execute arbitrary code: only load files from a trusted source.
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)
# Load the pickled studies of the three cohorts (files are resolved relative to the working directory).
chile, rome, flor = (
    get_studies(study_file)
    for study_file in ("studies3_chile", "studies3_rome", "studies3_flor")
)

In [2]:
# Plot the studies of each cohort, labelled with the cohort name.
# NOTE(review): `plot_all_studies` is not defined in the cells shown here (presumably it is
# provided by `from utils import *`). The positional .75 and 0 look like the same AUC / s
# cut-offs that `ranking_boxplots` passes to `TwoObjectiveSolutions` — confirm against utils.
plot_all_studies(chile, .75, 0, 'chile')
plot_all_studies(rome, .75, 0, 'rome') 
plot_all_studies(flor, .75, 0, 'flor') 


In [3]:

import warnings
warnings.filterwarnings("ignore")
from utils import *

# Read each cohort table and store its 'Alpha-Fet' column as a pandas categorical.
# 'Alpha-Fet' is the column later passed as `binary_target` to `ranking_boxplots`.
df_chile = pd.read_parquet("data/df_chile.parquet.gzip").astype({'Alpha-Fet': 'category'})
df_rome = pd.read_parquet("data/df_rome.parquet.gzip").astype({'Alpha-Fet': 'category'})
df_flor = pd.read_parquet("data/df_flor.parquet.gzip").astype({'Alpha-Fet': 'category'})


In [4]:
from tqdm import tqdm
from pandas.errors import PerformanceWarning



def _collect_shap_importances(
    filtered_trials,
    df_test,
    df_train,
    binary_target,
    Independent_testset,
):
    """Build one ``ModelInstance`` per trial and gather its AUC and feature explanations.

    Parameters
    ----------
    filtered_trials : list
        Trials whose ``params`` mapping holds ``'seed'``, ``'kfold_splits'`` and the
        remaining entries that are forwarded as ``xgb_params``.
    df_test, df_train, binary_target, Independent_testset
        Forwarded unchanged to ``ModelInstance`` (as ``Independent_testset_df``, ``df``,
        ``target`` and ``Independent_testset`` respectively).

    Returns
    -------
    tuple
        ``(abnormal, healthy, aucs)``: two DataFrames with one column per trial (keyed by
        the trial's position in ``filtered_trials``), taken from the ``"SHAP_abnormal"``
        and ``"SHAP_healty"`` entries of ``get_feature_explanation()``, and the list of
        ``get_AUC_on_test_data()`` results in the same order.
    """
    abnormal_by_trial = {}
    healthy_by_trial = {}
    aucs = []

    # The suppression is scoped to this loop so the global warning filters stay untouched.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=PerformanceWarning)
        for position, trial in tqdm(enumerate(filtered_trials), total=len(filtered_trials)):
            # Copy the params before popping so the trial stored in the study is never
            # mutated (this replaces deep-copying every trial up front).
            xgb_params = dict(trial.params)
            seed = xgb_params.pop('seed')
            kfold_splits = xgb_params.pop('kfold_splits')

            model_instance = ModelInstance(
                df=df_train,
                target=binary_target,
                xgb_params=xgb_params,
                kfold_splits=kfold_splits,
                Independent_testset=Independent_testset,
                Independent_testset_df=df_test,
                seed=seed,
            )
            aucs.append(model_instance.get_AUC_on_test_data())
            feature_metrics = model_instance.get_feature_explanation()
            abnormal_by_trial[position] = feature_metrics["SHAP_abnormal"]
            healthy_by_trial[position] = feature_metrics["SHAP_healty"]  # key spelling comes from utils

    # Build each frame once instead of inserting columns one by one, which fragments
    # the frame and is what triggers pandas' PerformanceWarning.
    return pd.DataFrame(abnormal_by_trial), pd.DataFrame(healthy_by_trial), aucs


def _importance_boxplot_figure(importances, title):
    """Return a plotly figure with one gray box per row label of ``importances``.

    ``importances`` has one column per trial; boxes are ordered by descending median
    across trials.
    """
    # Imported here so the helper stays self-contained; plotly is only needed for plotting.
    import plotly.graph_objects as go

    per_trial = importances.T  # rows: trials, columns: one per box
    ordered_columns = per_trial.median().sort_values(ascending=False).index

    box_traces = [
        go.Box(
            y=per_trial[column],
            name=column,
            marker_color='gray',
            showlegend=False,
        )
        for column in ordered_columns
    ]

    layout = go.Layout(
        title=title,
        yaxis=dict(title='Importance'),
        xaxis=dict(tickangle=45),  # rotate the x-axis labels by 45 degrees
        autosize=False,
        width=800,
        height=300,
        # No margins except the top one, which leaves room for the title.
        margin=dict(l=0, r=0, t=30, b=0),
    )
    return go.Figure(data=box_traces, layout=layout)


def ranking_boxplots(
    studies,
    df_test,
    df_train,
    binary_target,
    Independent_testset,
    title,
    auc_cutoff=0.75,
    s_cutoff=0,
):
    """Show box plots of the ``"SHAP_abnormal"`` values over all filtered trials.

    Parameters
    ----------
    studies : iterable
        Studies; each is filtered with
        ``TwoObjectiveSolutions(study, auc_cutoff=..., s_cutoff=...)``.
    df_test, df_train : pd.DataFrame
        Passed to ``ModelInstance`` as ``Independent_testset_df`` and ``df``.
    binary_target : str
        Passed to ``ModelInstance`` as ``target``.
    Independent_testset : bool
        Passed unchanged to ``ModelInstance``.
    title : str
        Figure title.
    auc_cutoff, s_cutoff : float, optional
        Cut-offs forwarded to ``TwoObjectiveSolutions``; the defaults are the values that
        were previously hard-coded.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    filtered_trials = [
        trial
        for study in studies
        for trial in TwoObjectiveSolutions(
            study, auc_cutoff=auc_cutoff, s_cutoff=s_cutoff
        ).get_filtered_trials().filtered_trials
    ]

    # Only the "abnormal" explanations are plotted; the other two results are available
    # to callers that use the helper directly.
    abnormal, _healthy, _aucs = _collect_shap_importances(
        filtered_trials,
        df_test=df_test,
        df_train=df_train,
        binary_target=binary_target,
        Independent_testset=Independent_testset,
    )

    _importance_boxplot_figure(abnormal, title).show()
    


# Chile: the same frame is passed as training and test data, with Independent_testset=False.
ranking_boxplots(
    studies=chile,
    df_test=df_chile,
    df_train=df_chile,
    binary_target='Alpha-Fet',
    Independent_testset=False,
    title="Chile",
)
# Rome and Florence: df_chile is the training frame, the cohort's own frame is the test data.
ranking_boxplots(
    studies=rome,
    df_test=df_rome,
    df_train=df_chile,
    binary_target='Alpha-Fet',
    Independent_testset=True,
    title="Rome",
)
ranking_boxplots(
    studies=flor,
    df_test=df_flor,
    df_train=df_chile,
    binary_target='Alpha-Fet',
    Independent_testset=True,
    title="Florence",
)


100%|██████████| 128/128 [00:57<00:00,  2.21it/s]


100%|██████████| 29/29 [00:14<00:00,  1.93it/s]


100%|██████████| 61/61 [00:34<00:00,  1.74it/s]


In [13]:
import plotly.express as px
import numpy as np

# NOTE(review): hidden state. In the cells visible here, `df_ranks_long`, `df_ranks_abnormal`
# and `AUCs` are only assigned as locals inside `ranking_boxplots`, and this cell's execution
# count (13) does not follow the previous one (4). Confirm where these names are defined at
# notebook scope before relying on Restart & Run All.
# An earlier draft of this cell derived the frame as `df_ranks_long = df_ranks_abnormal.T`.
# `AUCs` must hold one value per row of `df_ranks_long`; the new column is read back as the
# colour scale inside `wrapper_pcoorplot`.
df_ranks_long["auc_score"] = AUCs

def rank_sort(df_ranks_abnormal: pd.DataFrame) -> pd.Index:
    """Order the row labels of ``df_ranks_abnormal`` by their median rank, highest first.

    Values are ranked within each column, the median of those ranks is taken per row,
    and the row labels are returned sorted by that median in descending order.
    """
    ranks_within_column = df_ranks_abnormal.rank(axis=0)
    median_rank_per_row = ranks_within_column.median(axis=1)
    return median_rank_per_row.sort_values(ascending=False).index

# Row labels of df_ranks_abnormal as a pd.Index, highest median rank first (see rank_sort).
order = rank_sort(df_ranks_abnormal)

def wrapper_pcoorplot(df_ranks_abnormal, dims=None, auc_scores=None):
    """Small wrapper around plotly's parallel-coordinates plot.

    Parameters
    ----------
    df_ranks_abnormal : pd.DataFrame
        Frame that is transposed before plotting, so its row labels are the available
        dimensions and each of its columns becomes one line.
    dims : list, optional
        Row labels to draw, in display order. When omitted it is computed at call time as
        ``rank_sort(df_ranks_abnormal).tolist()``. It is no longer frozen at definition
        time from a notebook-level ``order``.
    auc_scores : array-like, optional
        Values used to colour the lines. When omitted, the notebook-level
        ``df_ranks_long["auc_score"]`` is used, as before.

    Returns
    -------
    The figure returned by ``px.parallel_coordinates``, with every dimension set to the
    same value range.
    """
    if dims is None:
        dims = rank_sort(df_ranks_abnormal).tolist()
    if auc_scores is None:
        # Backward-compatible fallback; prefer passing the scores explicitly.
        auc_scores = df_ranks_long["auc_score"]

    fig_ranks = px.parallel_coordinates(
        df_ranks_abnormal.T,
        dimensions=dims,
        color=auc_scores,
        color_continuous_scale='BuGn',
    )

    # Use one shared range, taken over the whole frame, so all axes are comparable.
    global_min = np.min(df_ranks_abnormal.values)
    global_max = np.max(df_ranks_abnormal.values)
    for dimension in fig_ranks.data[0]['dimensions']:
        dimension['range'] = [global_min, global_max]

    fig_ranks.update_layout(
        yaxis_tickformat=".2f",
        coloraxis_colorbar=dict(title="AUC (test data)"),
    )

    return fig_ranks

# Parallel-coordinates figure restricted to the first 10 entries of `order`.
top_ranked = order[:10].tolist()
fig_abnormal = wrapper_pcoorplot(df_ranks_abnormal, top_ranked)
fig_abnormal.show()  # explicit show, as this is a notebook cell
