Data have been downloaded here : <a href="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0209353">Plos One - World largest Study</a><br>
From data available only tables :
<ul>
    <li>S8 (baseline : metadata)</li>
    <li>S10 (blood cells parameters)</li>
    <li>S11 (blood parameters)</li>
    <li>S12 (weight, blood pressure, well being, ketones)</li>
    <li>S15 (lipid and glycemia parameters)</li>
</ul>
Were downloaded, a little pre-processing was done directly in Excel :
<ul>
    <li>keeping only the id and parameters, except for the metadata</li>
    <li>removing header and footer lines which were notes, from each file</li>
    <li>renaming the columns "lenght of fast (days)" by "length_of_fast"</li>
</ul>



In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Load the data
blood_cell           = pd.read_excel('blood_cell.xlsx')
blood_param          = pd.read_excel('blood_param.xlsx')
lipid_glucose        = pd.read_excel('lipid_glucose.xlsx')
metadata             = pd.read_excel('metadata.xlsx')
weight_sbp_wb_ketone = pd.read_excel('weight_sbp_wb_ketones.xlsx')

# Merge the dataframes on 'id' column
df_merge = (metadata
            .merge(weight_sbp_wb_ketone, on='id')
            .merge(lipid_glucose, on='id')
            .merge(blood_cell, on='id')
            .merge(blood_param, on='id')
)


##### Cleaning : 
<ul>
    <li>transform the dataframe to a long format adding a column 'timepoint'</li>
    <li>cast columns to the right format replacing "<" by "" or "," by "."</li>
    <li>replace strong outliers by missing values</li>
</ul>

In [2]:
# delete the extra spaces in the column names
df_merge.columns = df_merge.columns.str.replace('  ', ' ')

# Get the columns ending with "pre" or "post"
columns_to_melt = df_merge.filter(regex='pre$|post$').columns
# Get metadata columns
id_vars_col = metadata.columns.difference(['age (years)', 'fasting duration (days)', 'sex'])
df_merge.sex.replace({"M":1, "F":0}, inplace=True)

df_long = pd.melt(df_merge, 
                  id_vars=id_vars_col, 
                #   value_vars=columns_to_melt, 
                  var_name='parameter')

## Extract the timepoint based on the suffix of the parameter column
df_long['timepoint'] = (df_long['parameter'].str
                        .endswith('post').astype(int))

## Extract the value name based on what comes before "pre" or "post"
df_long['parameter'] = (df_long['parameter'].str
                        .replace(r'\s*(pre|post)$', '', regex=True))

# Rename the value column in order to have like a categorical variable
# df_long.timepoint = (df_long["timepoint"].astype(str)
#                      .replace({"0":0,"1":1}))

# Replace the "<" by "" in the value column to cast the columns as numeric
df_long.loc[:,"value"] = (df_long["value"].astype(str)
                          .replace({"<": "", ",":"."}, regex=True))
df_long["value"] = pd.to_numeric(df_long["value"], errors="coerce")

# replace strong outliers by np.nan, defined as values that are >= 5 std 
# from the mean of each group (parameter, timepoint)
df_long['value'] = (df_long
                    .groupby(['parameter', 'timepoint'])['value']
                    .transform(
    lambda x : x.mask(np.abs(((x - x.mean()) / x.std())).ge(5), np.nan)
    ))


In [149]:
df_raw = pd.read_parquet('merged_data_wide.parquet')
df = df_raw.reset_index(drop = True).set_index("id").copy()
df0 =df.query("timepoint.eq(0)").copy()


##### Addition of informations in order to prepare the analysis

In [3]:
# addition of a numeric column indicating exactly the timepoint.
# df_long["timeline"] = np.where(df_long["timepoint"] == "pre", 0, df_long["fasting duration (days)"])

df_long.sort_values(["parameter", "id", "timepoint"], inplace=True)
df_long["value_change"] = (df_long
                           .groupby(["id", "parameter"])["value"]
                           .transform("diff"))
# we need to have the change in all rows of ('id', 'parameter') groups for the following steps
# using .bfill() or .ffill() introduce errors in the data, we prefer to use "first" in a transform function.
df_long["value_change"] = df_long.groupby(["id", "parameter"])["value_change"].transform("first")

# it still some outliers for the change, after having a look on it, 
# we decided to remove the values that are >= 8 std from the mean of each group (parameter, timepoint)
outlier_change_mask = (
    df_long
    .groupby(['parameter', 'timepoint'])['value_change']
    .transform(lambda x: np.abs((x - x.mean()) / x.std()).ge(7))
    )
df_long.loc[outlier_change_mask, 'value_change'] = np.nan
df_long.loc[outlier_change_mask, 'value'] = np.nan



In [4]:
df_wide = df_long.pivot_table(index=['id', 'timepoint'], columns="parameter", values=["value", "value_change"], ).reset_index()
df_wide.columns = [" ".join(col) for col in df_wide.columns.values]
df_wide.columns = [col[6:] if col.startswith("value ") else col for col in df_wide.columns.values]
df_wide.columns = [col[13:] + " change" if col.startswith("value_change") else col for col in df_wide.columns.values]
df_wide.columns = [col.strip() for col in df_wide.columns.values]
df_wide["timepoint"] = df_wide["timepoint"].replace({"pre":0, "post":1})

# it will be usefill for having a jittered x axis on timepoint.
df_wide['jittered_x'] = (
    df_wide['timepoint']
    .apply(lambda x : x + round(np.random.uniform(-0.1, 0.1),2))
)
df_wide.sort_values(by=['timepoint','id'], inplace=True)


##### Export the cleaned dataframe to an Excel file

In [104]:
# df_long.to_parquet('merged_data_long.parquet', index=False)
# df_wide.to_parquet('merged_data_wide.parquet', index=False)

### Make the correlation matrix

In [262]:

def make_custom_corr_matrix(method="spearman"):
    metadata_not_to_exclude = ["fasting duration (days)", "age (years)", "sex"]
    baseline_corr = (
        df_long.query("timepoint.eq(0)")
            .groupby("parameter")[["value", "value_change"]]
            .agg("corr", method)
            .reset_index()
            .drop_duplicates(subset="parameter")
            .fillna(0)
            .drop(["level_1", "value"], axis=1)
            .rename(columns={"value_change": "baseline of the parameter"})
            .apply(lambda x : x + " change" if x.name == "parameter" else x)
            .set_index("parameter").T
        )
    changes_corr = (
        df_wide.query("timepoint.eq(0)")
            .drop(columns=["timepoint", "id", "jittered_x"] 
                    + list(set([col for col in df_wide.columns if "change" not in col])
                            .difference(metadata_not_to_exclude))
                    )
            .corr(method)
    )
    return pd.concat([changes_corr, baseline_corr]).fillna(0).iloc[:,:-len(metadata_not_to_exclude)]

corr_matrix = make_custom_corr_matrix("pearson")

In [263]:
df_graph = df_wide[['id', 'timepoint', 'fasting duration (days)','CRP (mg/l) change' , 'sex']].query("timepoint == 0")
px.scatter(df_graph, x='sex', y='CRP (mg/l) change', trendline="ols")
# df_graph[["fasting duration (days)", 'BMI (kg/m²) change']].corr(method = "spearman")

In [88]:
import dash
 
from dash import Dash, Input, Output, dcc, html, State

app = Dash(__name__)

app.layout = html.Div([
        dcc.RangeSlider(18,100, id = "age", value = [18,100]),
        dcc.Graph(id = "graph"),
    ]
    )
@app.callback(
    Output("graph", "figure"),
    Input("age", "value"),
)
def update_graph(value):
    dff = df_wide.loc[df_wide["age (years)"].between(value[0],value[1] )]
    counts = dff.sex.value_counts().to_frame().reset_index()
    # box = px.sunburst(counts, path= ["sex"], values="count")
    fig = go.Figure(go.Pie( 
        values= counts["count"], 
        labels= ["M", "F"]),
        
        
        )
    fig.update_layout(
        transition = {"duration":500 ,'easing':'cubic-in-out'},
        # pull= 0.8

    )
    
    return fig

app.run_server(port=8060)

In [35]:
counts = df_wide.sex.value_counts().to_frame().reset_index()
counts

Unnamed: 0,sex,count
0,0.0,841
1,1.0,581


In [58]:
df_wide.loc[df_wide["age (years)"].between([18,100])]

Unnamed: 0,id,timepoint,AP (µkat/l),Acetoacetic acid (mg/dL),BMI (kg/m²),CRP (mg/l),Ca (mmol/l),Creatinine (µmol/l),DBP (mmHg),ESR 1h,...,TC (mmol/l) change,TG (mmol/l) change,Thrombocytes (103/µl) change,Urea (mmol/l) change,Uric acid (µmol/l) change,glucose (mmol/l) change,puls (beats/min) change,waist (cm) change,weight (kg) change,jittered_x
1212,607,0,0.93352,0.0,31.644318,0.75,2.32,58.344,82.0,8.0,...,-1.586,0.3762,-50.0,-1.5531,386.75,-0.9435,9.0,,-8.55,-0.01


In [61]:
df_wide.dropna(subset=["age (years)"])

Unnamed: 0,id,timepoint,AP (µkat/l),Acetoacetic acid (mg/dL),BMI (kg/m²),CRP (mg/l),Ca (mmol/l),Creatinine (µmol/l),DBP (mmHg),ESR 1h,...,TC (mmol/l) change,TG (mmol/l) change,Thrombocytes (103/µl) change,Urea (mmol/l) change,Uric acid (µmol/l) change,glucose (mmol/l) change,puls (beats/min) change,waist (cm) change,weight (kg) change,jittered_x
0,1,0,1.45029,0.0,28.500000,0.89,2.40,70.720,80.0,8.0,...,-0.598,-0.7524,-34.0,-0.8684,190.40,-1.1100,8.0,-7.0,-5.2,-0.03
2,2,0,1.18357,0.0,23.862396,0.21,2.46,58.344,67.0,8.0,...,-1.534,0.6498,-30.0,-0.0501,71.40,-1.0545,6.0,,-3.5,-0.01
4,3,0,1.41695,5.0,,2.00,2.36,69.836,,6.0,...,-0.754,0.4218,1.0,-2.9225,255.85,-1.2210,,-3.0,,-0.04
6,4,0,1.65033,,27.327484,0.49,2.42,76.024,102.0,2.0,...,-0.598,-0.5814,-17.0,-1.1189,113.05,-0.7770,1.0,-4.0,-2.6,-0.09
8,5,0,0.73348,0.0,21.079882,0.03,2.33,58.344,90.0,6.0,...,-0.624,0.1710,-45.0,-0.3173,107.10,-1.2210,-5.0,-3.0,-2.1,-0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2834,1418,0,0.88351,,25.826446,0.20,2.37,72.488,70.0,6.0,...,-1.248,-0.5700,18.0,-1.5197,202.30,-1.2210,19.0,-4.0,-3.2,0.02
2836,1419,0,1.08355,,30.592956,7.20,2.39,60.996,80.0,16.0,...,-0.364,-2.2686,-69.0,-1.8537,172.55,0.4995,9.0,-3.0,-4.1,0.02
2838,1420,0,1.03354,,,0.17,2.45,85.748,,4.0,...,-1.534,-0.2394,11.0,-0.1670,-23.80,-1.0545,,,,0.04
2840,1421,0,1.21691,,21.083563,0.35,2.25,53.040,82.0,12.0,...,0.104,-0.0114,49.0,-1.3026,41.65,-0.7215,14.0,,-2.9,-0.06
