In [46]:
import pandas as pd
import numpy as np  
from dash_website import ALGORITHMS_RENDERING
from dash_website.xwas import BAR_PLOT_TABLE_COLUMNS, FEATURES_CORRELATIONS_TABLE_COLUMNS

# The (dimension, category) pair inspected by the cells below.
dimension = "Abdomen"
category = "Alcohol"

# Both frames are stored as plain dicts; later cells rebuild them with pd.DataFrame(...).
data_features = pd.read_feather(
    f"../../data/xwas/multivariate_feature_importances/dimension_category/features_{dimension}_{category}.feather"
).to_dict()
data_scores = pd.read_feather(
    "../../data/xwas/multivariate_results/scores.feather",
    columns=["category", "dimension", "r2", "std", "algorithm"],
).to_dict()

In [47]:
# Echo the dimension selected in the configuration cell.
dimension

'Abdomen'

In [20]:
import plotly.graph_objects as go

# Rebuild the scores frame from its dict form and key it by (dimension, category).
scores_raw = pd.DataFrame(data_scores).set_index(["dimension", "category"])

# Display labels used in the title, in the order they are reported.
algorithm_labels = {"elastic_net": "Elastic Net", "light_gbm": "Light GBM", "neural_network": "Neural Network"}

if (dimension, category) in scores_raw.index:
    # A list of full keys is unambiguously a row selection and always yields a
    # DataFrame that keeps the (dimension, category) index. The bare
    # `.loc[dimension, category]` form is ambiguous (row/column vs. two-level row key)
    # and can collapse to a Series when the key resolves to a single row, which
    # breaks the column lookups below.
    scores = scores_raw.loc[[(dimension, category)]]
    # Positional lookup because the (dimension, category) index is repeated on every row.
    best_algorithm = scores.iloc[scores["r2"].argmax()]["algorithm"]

    scores_algorithm = scores.reset_index().set_index("algorithm").round(3)
    # Report only the algorithms that actually have a row, so a partially available
    # result set gives a shorter title instead of a KeyError. With all three
    # algorithms present the title is unchanged.
    title = "R² : " + ", ".join(
        f"{label} {scores_algorithm.loc[name, 'r2']} +- {scores_algorithm.loc[name, 'std']}"
        for name, label in algorithm_labels.items()
        if name in scores_algorithm.index
    )
else:
    # No scores for this pair: Light GBM is used to order the variables instead.
    scores = None
    best_algorithm = "light_gbm"
    title = "We don't have the scores for the moment, they are going to be uploaded soon."

# Rebuild the feature-importance frame from its dict form, keyed by (algorithm, variable).
features = pd.DataFrame(data_features).set_index(["algorithm", "variable"])

# Order the variables by the best algorithm's share of absolute importance, largest first.
best_abs_importance = features.loc[best_algorithm].abs()
sorted_variables = (best_abs_importance / best_abs_importance.sum()).sort_values(by=["feature_importance"], ascending=False).index

algorithms = features.index.get_level_values("algorithm").drop_duplicates()

# Start from the column layout given by BAR_PLOT_TABLE_COLUMNS; one row per variable.
table_features = pd.DataFrame(None, columns=BAR_PLOT_TABLE_COLUMNS.keys())
table_features["variable"] = sorted_variables

for algorithm in algorithms:
    # Tuples address full (algorithm, variable) keys of the MultiIndex, in display order.
    sorted_algorithm_variable = [(algorithm, variable) for variable in sorted_variables]

    # Select the column explicitly so a 1-D array is written into each table column;
    # assigning the 2-D `.values` of the whole frame only works when the frame has
    # exactly one value column.
    feature_importance = features.loc[sorted_algorithm_variable, "feature_importance"]
    abs_importance = feature_importance.abs()
    percentage_importance = abs_importance / abs_importance.sum()

    table_features[f"feature_{algorithm}"] = feature_importance.values
    table_features[f"percentage_{algorithm}"] = percentage_importance.values
title

"We don't have the scores for the moment, they are going to be uploaded soon."

In [17]:
hovertemplate = "Variable: %{y} <br>Percentage of overall feature importance: %{x:.3f} <br>Feature importance: %{customdata:.3f} <br><extra></extra>"

# One horizontal bar trace per algorithm: the bar length is the percentage column,
# and the raw feature importance is attached as custom data for the hover text.
bars = [
    go.Bar(
        name=ALGORITHMS_RENDERING[algorithm],
        x=table_features[f"percentage_{algorithm}"],
        y=sorted_variables,
        orientation="h",
        customdata=table_features[f"feature_{algorithm}"],
        hovertemplate=hovertemplate,
    )
    for algorithm in algorithms
]

fig = go.Figure(bars)
fig.update_layout(
    width=1000,
    height=800,
    xaxis={"title": "Percentage of overall feature importance", "showgrid": False},
    yaxis={"title": "Variables", "showgrid": False},
)

fig.show()

In [23]:
# Keep only the percentage columns that are compared against each other.
table_correlations = table_features[
    [
        "percentage_correlation",
        "percentage_elastic_net",
        "percentage_light_gbm",
        "percentage_neural_network",
    ]
]

# Spearman correlation between the percentage columns, rounded to 3 decimals and
# relabelled on both axes through FEATURES_CORRELATIONS_TABLE_COLUMNS.
(
    table_correlations.corr(method="spearman")
    .round(3)
    .rename(index=FEATURES_CORRELATIONS_TABLE_COLUMNS)
    .reset_index()
    .rename(columns=FEATURES_CORRELATIONS_TABLE_COLUMNS)
)

Unnamed: 0,Unnamed: 1,Percentage Correlation,Percentage Elastic Net,Percentage Light GBM,Percentage Neural Network
0,Percentage Correlation,1.0,0.344,0.163,0.297
1,Percentage Elastic Net,0.344,1.0,0.073,0.62
2,Percentage Light GBM,0.163,0.073,1.0,0.052
3,Percentage Neural Network,0.297,0.62,0.052,1.0


In [24]:
# Scores selected for the current (dimension, category) pair; None when the pair
# is absent from the scores table.
scores

Unnamed: 0_level_0,Unnamed: 1_level_0,r2,std,algorithm
dimension,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HeartECG,BloodCount,0.033416,0.028906,elastic_net
HeartECG,BloodCount,0.031667,0.029463,light_gbm
HeartECG,BloodCount,0.029556,0.028107,neural_network


In [26]:
# Index the selected scores by algorithm, rounded to 3 decimals for display.
scores_algorithm = scores.reset_index().set_index("algorithm").round(3)
# Same title as the plotting cell, assembled from adjacent f-string pieces.
(
    f"R² : Elastic Net {scores_algorithm.loc['elastic_net', 'r2']} +- {scores_algorithm.loc['elastic_net', 'std']}, "
    f"Light GBM {scores_algorithm.loc['light_gbm', 'r2']} +- {scores_algorithm.loc['light_gbm', 'std']}, "
    f"Neural Network {scores_algorithm.loc['neural_network', 'r2']} +- {scores_algorithm.loc['neural_network', 'std']}"
)

'R² : Elastic Net 0.033 +- 0.029, Light GBM 0.032 +- 0.029, Neural Network 0.03 +- 0.028'

In [28]:
# Rows of the full scores table for the current (dimension, category) pair.
# `scores[dimension, category]` is a *column* lookup with a tuple key, which raises
# KeyError on the frame built above (and TypeError when `scores` is None).
# Filtering `scores_raw` by its row index works in both cases and gives an empty
# frame when the pair has no scores.
scores_raw.loc[scores_raw.index.isin([(dimension, category)])]

Unnamed: 0_level_0,Unnamed: 1_level_0,r2,std,algorithm
dimension,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HeartECG,BloodCount,0.033416,0.028906,elastic_net
HeartECG,BloodCount,0.031667,0.029463,light_gbm
HeartECG,BloodCount,0.029556,0.028107,neural_network


In [60]:
def split_if_necessary(category):
    """Extract the category name from a dataset label.

    Labels containing ``"medical_diagnoses"`` are returned unchanged. Any other
    label is split on ``"_"`` and its second segment is returned. A label without
    an underscore has no second segment and is returned unchanged instead of
    raising ``IndexError``.

    Args:
        category: label string, e.g. a value of the ``env_dataset`` column.

    Returns:
        The label itself, or its second ``"_"``-separated segment.
    """
    if "medical_diagnoses" in category:
        return category

    segments = category.split("_")
    # Only index the second segment when the split actually produced one.
    return segments[1] if len(segments) > 1 else category
# Distinct values of the "env_dataset" column, each passed through split_if_necessary.
category_scores = (
    pd.read_csv("../../Scores_ElasticNet_test.csv")["env_dataset"]
    .drop_duplicates()
    .apply(split_if_necessary)
    .tolist()
)

In [66]:
from dash_website import MAIN_CATEGORIES_TO_CATEGORIES

# Categories listed under MAIN_CATEGORIES_TO_CATEGORIES["All"] that have no entry in category_scores.
missing_cats = [cat for cat in MAIN_CATEGORIES_TO_CATEGORIES["All"] if cat not in category_scores]

# Reverse check: print every scored category that is not listed under "All".
for cat in category_scores:
    if cat in MAIN_CATEGORIES_TO_CATEGORIES["All"]:
        continue
    print(cat)

HeartSize
AnthropometryImpedance
AnthropometryBodySize
Claudification


In [68]:
# Categories listed under MAIN_CATEGORIES_TO_CATEGORIES["All"] that have no entry in category_scores.
missing_cats

['Anthropometry',
 'BloodBiochemistry',
 'Claudication',
 'CognitiveFluidIntelligence',
 'CognitiveMatrixPatternCompletion',
 'CognitiveNumericMemory',
 'CognitivePairedAssociativeLearning',
 'CognitivePairsMatching',
 'CognitiveProspectiveMemory',
 'CognitiveReactionTime',
 'CognitiveSymbolDigitSubstitution',
 'CognitiveTowerRearranging',
 'CognitiveTrailMaking',
 'Genetics',
 'HeartFunction',
 'Impedance',
 'Phenotypic',
 'PhysicalActivity',
 'PhysicalActivityQuestionnaire',
 'Smoking',
 'UrineBiochemistry']

In [None]:

['BloodBiochemistry',
 'CognitiveFluidIntelligence',
 'CognitiveMatrixPatternCompletion',
 'CognitiveNumericMemory',
 'CognitivePairedAssociativeLearning',
 'CognitivePairsMatching',
 'CognitiveProspectiveMemory',
 'CognitiveReactionTime',
 'CognitiveSymbolDigitSubstitution',
 'CognitiveTowerRearranging',
 'CognitiveTrailMaking',
 'PhysicalActivityQuestionnaire',
 'Smoking',
 'UrineBiochemistry']