The data were downloaded from: <a href="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0209353">Plos One - World largest Study</a><br>
Of the available data, only the following tables were downloaded:
<ul>
    <li>S8 (baseline : metadata)</li>
    <li>S10 (blood cells parameters)</li>
    <li>S11 (blood parameters)</li>
    <li>S12 (weight, blood pressure, well being, ketones)</li>
    <li>S15 (lipid and glycemia parameters)</li>
</ul>
A little pre-processing was then done directly in Excel:
<ul>
    <li>keeping only the id and parameters, except for the metadata</li>
    <li>removing header and footer lines which were notes, from each file</li>
    <li>renaming the column "lenght of fast (days)" to "length_of_fast"</li>
</ul>



In [4]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Read the pre-processed Excel exports, one dataframe per supplementary table.
metadata = pd.read_excel('metadata.xlsx')
weight_sbp_wb_ketone = pd.read_excel('weight_sbp_wb_ketones.xlsx')
lipid_glucose = pd.read_excel('lipid_glucose.xlsx')
blood_cell = pd.read_excel('blood_cell.xlsx')
blood_param = pd.read_excel('blood_param.xlsx')

# Join every table onto the metadata through the shared 'id' column.
# merge() defaults to an inner join, so only ids present in all tables remain.
df_merge = metadata
for table in (weight_sbp_wb_ketone, lipid_glucose, blood_cell, blood_param):
    df_merge = df_merge.merge(table, on='id')


In [None]:
# Index labels of the rows of df0 whose age lies in the inclusive
# [age[0], age[1]] window. `Series.between` needs both bounds and the column
# is named "age (years)"; `df0` and `age` are defined in other cells.
df0[df0["age (years)"].between(age[0], age[1])].index

In [3]:
# Scratch check of set intersection; set literals avoid building throwaway
# lists (a set drops duplicates anyway, so the repeated 3 is not needed).
{1, 2, 3}.intersection({3, 4, 5})

{3}

##### Cleaning : 
<ul>
    <li>transform the dataframe to a long format adding a column 'timepoint'</li>
    <li>cast columns to the right format replacing "<" by "" or "," by "."</li>
    <li>replace strong outliers by missing values</li>
</ul>

In [5]:
def _mask_strong_outliers(group):
    """Return *group* with values lying 5 or more std from its mean set to NaN."""
    z_scores = (group - group.mean()) / group.std()
    return group.mask(np.abs(z_scores).ge(5), np.nan)


# Column headers: collapse double spaces into single ones.
df_merge.columns = df_merge.columns.str.replace('  ', ' ')

# Metadata columns identify a row; columns ending in "pre" or "post" hold
# the measurements that get stacked.
id_vars_col = metadata.columns.tolist()
columns_to_melt = df_merge.filter(regex='pre$|post$').columns

# Wide -> long: one row per (metadata row, measured column).
df_long = df_merge.melt(id_vars=id_vars_col,
                        value_vars=columns_to_melt,
                        var_name='parameter')

# The suffix of the original column name gives the timepoint ...
measured_post = df_long['parameter'].str.endswith('post')
# ... and whatever precedes the suffix is the parameter name itself.
df_long['parameter'] = df_long['parameter'].str.replace(
    r'\s*(pre|post)$', '', regex=True)
# Keep the timepoint as a readable label ("pre" / "post").
df_long['timepoint'] = measured_post.map({False: "pre", True: "post"})

# Drop "<" markers and use "." as decimal separator so the values can be
# parsed as numbers; anything that still fails to parse becomes NaN.
value_text = (df_long["value"].astype(str)
              .str.replace("<", "", regex=False)
              .str.replace(",", ".", regex=False))
df_long["value"] = pd.to_numeric(value_text, errors="coerce")

# Strong outliers (>= 5 std from the mean) are replaced by NaN, the mean and
# std being computed inside each (parameter, timepoint) group.
df_long['value'] = (
    df_long
    .groupby(['parameter', 'timepoint'])['value']
    .transform(_mask_strong_outliers)
)


In [6]:
# Reload the exported wide table and index it by subject id.
df_raw = pd.read_parquet('merged_data_wide.parquet')
df = df_raw.reset_index(drop=True)
df = df.set_index("id").copy()
# Independent copy restricted to the rows whose timepoint equals 0.
df0 = df.loc[df["timepoint"].eq(0)].copy()


In [19]:
# Inclusive [min, max] window applied to the "age (years)" column.
age = [18, 50]
in_age_window = df0["age (years)"].between(*age)
# Index labels of df0 falling inside the window, as a set for fast lookup.
index = set(df0.loc[in_age_window].index)
# Which of these hand-picked labels are inside the window?
a = {5, 6, 8, 20, 23, 28} & index
a

{5, 6, 8, 20, 23, 28}

##### Adding information to prepare the analysis

In [16]:
# Number of standard deviations from the group mean at or beyond which a
# change is treated as an outlier.
# NOTE(review): the comment that used to sit here said 8 std while the code
# applied 7; the value the code used is kept - confirm the intended cut-off.
CHANGE_OUTLIER_STD = 7

# Numeric position of each row on the timeline: 0 for the "pre" rows, the
# fasting duration (days) for the others.
df_long["timeline"] = np.where(df_long["timepoint"] == "pre", 0, df_long["fasting duration (days)"])

# Order the rows by timeline inside every (id, parameter) pair so that diff()
# subtracts the earlier measurement from the later one.
df_long.sort_values(["parameter", "id", "timeline"], inplace=True)
df_long["value_change"] = (df_long
                           .groupby(["id", "parameter"])["value"]
                           .transform("diff"))
# The change has to be present on every row of an (id, parameter) group for
# the following steps. Using .bfill() or .ffill() introduced errors in the
# data, so the group's "first" change is broadcast with transform instead.
df_long["value_change"] = df_long.groupby(["id", "parameter"])["value_change"].transform("first")

# Some outliers remain in the change itself: flag the rows whose change lies
# CHANGE_OUTLIER_STD or more std from the mean of their
# (parameter, timepoint) group, then blank both the change and the value.
outlier_change_mask = (
    df_long
    .groupby(['parameter', 'timepoint'])['value_change']
    .transform(lambda x: np.abs((x - x.mean()) / x.std()).ge(CHANGE_OUTLIER_STD))
    )
df_long.loc[outlier_change_mask, ['value_change', 'value']] = np.nan



In [17]:
def _flatten_column(levels):
    """Flatten one two-level column label of the pivoted table into a string.

    ("value", p) becomes "p", ("value_change", p) becomes "p change", and any
    other label (the former index columns, whose second level is empty) is
    joined with a space and stripped.
    """
    top, sub = levels
    if top == "value":
        return sub.strip()
    if top == "value_change":
        return f"{sub} change".strip()
    return " ".join(levels).strip()


# Long -> wide: one row per (id, timepoint) with one column per parameter for
# the value and one for its change. pivot_table aggregates with the mean by
# default when several rows share the same index/column combination.
index_columns = ['id', 'timepoint', 'sex', 'fasting duration (days)', 'age (years)']
df_wide = df_long.pivot_table(
    index=index_columns,
    columns="parameter",
    values=["value", "value_change"],
).reset_index()
df_wide.columns = [_flatten_column(col) for col in df_wide.columns.values]

# Encode the timepoint as 0/1. map() yields integers directly, whereas
# replace() on a string column relies on an implicit downcast that pandas
# deprecated. timepoint is expected to hold only "pre"/"post"; any other
# label would become NaN here.
df_wide["timepoint"] = df_wide["timepoint"].map({"pre": 0, "post": 1})

# Jittered x position around the timepoint (+/- 0.1, rounded to 2 decimals),
# useful to spread the points of a scatter plot. Drawn in one vectorised call;
# no seed is set, so the jitter differs from run to run.
jitter = np.random.uniform(-0.1, 0.1, size=len(df_wide)).round(2)
df_wide['jittered_x'] = df_wide['timepoint'] + jitter
df_wide.sort_values(by=['timepoint', 'id'], inplace=True)


##### Export the cleaned dataframes to Parquet files

In [18]:
# Write both layouts of the cleaned data to Parquet, without the row index.
for frame, target in ((df_long, 'merged_data_long.parquet'),
                      (df_wide, 'merged_data_wide.parquet')):
    frame.to_parquet(target, index=False)

#### Draw the correlation exploration plot

In [19]:
# Read the correlation matrix from Excel; the sheet's first column becomes
# the row index.
correlation_matrix = pd.read_excel("correlation_matrix.xlsx", index_col=0)


In [20]:
# Work on the first column of the correlation matrix.
param = correlation_matrix.columns[0]
first_column = correlation_matrix.loc[:, param]
# One-column dataframe whose index is named "index"; missing entries become 0.
filtered_matrix = first_column.reset_index().set_index("index").fillna(0)
# Row labels, captured before the sort below (original row order).
list_of_param = filtered_matrix.index.to_list()
# Ascending order of the selected column.
filtered_matrix = filtered_matrix.sort_values(by=param)
# fig = go.Figure(data=go.Heatmap(
#             z=filtered_matrix.values,
#             y=filtered_matrix.index,
#             x=filtered_matrix.columns,
#             colorscale='RdBu',
#             colorbar=dict(title='', orientation="v", thickness=15,tickfont=dict(size=12)),
#             zmin = -1, zmax = 1,
#             hoverongaps=False,
#             hoverinfo='none',  # Disable hovering
#             text=[[name] for name in list(filtered_matrix.index)],
#             texttemplate="%{text}",
#             textfont={"size":10},
#         ))
# fig.update_layout(
#         margin=dict(l=20, r=20, t=50, b=0),
#         yaxis=dict(title="", showticklabels=False),
#         xaxis=dict(title="", showticklabels=False),
#         width=300,
#         height=1100,
#         hovermode="x",
#     )
# fig.update_xaxes(title=f"Correlation : {param}")
# print(fig.layout.xaxis.hoverformat)

In [21]:
# Two reference rows holding the extreme values +1 and -1, labelled through
# an index named "index".
ref = pd.DataFrame(
    {"value": [1, -1]},
    index=pd.Index(["ref1", "ref_neg1"], name="index"),
)

In [22]:
# Entries of the first column of filtered_matrix, unpacked into a list
# (the elements keep the scalar type of the underlying array).
values = [*filtered_matrix.iloc[:, 0].values]


In [6]:
# Reload the correlation matrix; the sheet's first column becomes the row index.
correlation_matrix = pd.read_excel("correlation_matrix.xlsx", index_col=0)
# Single-element list wrapping the {column: {row label: value}} mapping.
json_matrix = [correlation_matrix.to_dict()]

# Entries of the "baseline of the parameter" column, rounded to 2 decimals.
# Missing values are skipped with pd.notna instead of comparing str(value)
# to "nan", and the column is indexed directly so that a missing column
# raises a clear KeyError rather than failing on None.
baseline_column = json_matrix[0]["baseline of the parameter"]
filtered_data_dict = {
    key: round(value, 2)
    for key, value in baseline_column.items()
    if pd.notna(value)
}
# Largest absolute value first.
sorted_data_dict = dict(
    sorted(filtered_data_dict.items(), key=lambda item: abs(item[1]), reverse=True)
)


In [9]:
# Number of entries kept after dropping the missing values; len() works on
# the dict itself, no .keys() view is needed.
len(sorted_data_dict)

41

In [139]:


# Column of the correlation matrix shown in the heatmap.
param = 'baseline of the parameter'
# One-column dataframe whose index is named "index"; missing entries become 0.
filtered_matrix = correlation_matrix.loc[:,param].reset_index().set_index("index").fillna(0)
# Row labels in the matrix's original order (captured before the sort).
list_of_param =  filtered_matrix.index.to_list()
filtered_matrix.sort_values(by = param,inplace=True)

hours = [param]
# Row labels taken AFTER the sort, so that days[i] names the row behind
# data[i]; the pre-sort list would not line up with the sorted values.
days = filtered_matrix.index.to_list()
data = list(filtered_matrix.iloc[:,0].values)

# Chart options for a single-column heatmap: one cell per row of
# filtered_matrix, encoded as [x index, y index, value].
options = {
    'tooltip': {
        'position': 'top',
    },
    'grid': {
        'height': '50%',
        'top': '10%'
    },
    'xAxis': {
        'type': 'category',
        'data': [param],
        'splitArea': {
            'show': True
        }
    },
    'yAxis': {
        'show': False,
    },
    'visualMap': {
        # The cells hold correlation values, which lie in [-1, 1]; the colour
        # scale has to span that range for differences to be visible.
        'min': -1,
        'max': 1,
        'calculable': True,
        'orient': 'horizontal',
        'left': 'center',
        'bottom': '25%'
    },
    'series': [
        {
            'name': 'Punch Card',
            'type': 'heatmap',
            'data': [[0, i, d] for i, d in enumerate(data)],
            'label': {
                'show': True,
                'position': 'inside',
            },
            'emphasis': {
                'itemStyle': {
                    'shadowBlur': 10,
                    'shadowColor': 'rgba(0, 0, 0, 0.5)'
                }
            }
        }
    ]
}

[[0, 0, -0.8991413404613746],
 [0, 1, -0.8515919761969845],
 [0, 2, -0.6558393097409405],
 [0, 3, -0.6525243285992586],
 [0, 4, -0.6498552500383081],
 [0, 5, -0.6213175249306229],
 [0, 6, -0.5952905061690204],
 [0, 7, -0.5733215536568387],
 [0, 8, -0.57083442689596],
 [0, 9, -0.5278778220222644],
 [0, 10, -0.5041531234010463],
 [0, 11, -0.4853838881793094],
 [0, 12, -0.4810645415272526],
 [0, 13, -0.4709228715750567],
 [0, 14, -0.4569667637006651],
 [0, 15, -0.4367197786930928],
 [0, 16, -0.4360541648249073],
 [0, 17, -0.4205220162706504],
 [0, 18, -0.4145689886180869],
 [0, 19, -0.4130390807728215],
 [0, 20, -0.4064701066488774],
 [0, 21, -0.4033050373125016],
 [0, 22, -0.3873264619774134],
 [0, 23, -0.364329816035065],
 [0, 24, -0.3295898762878696],
 [0, 25, -0.3256605992658447],
 [0, 26, -0.2985249529862553],
 [0, 27, -0.2840026766471113],
 [0, 28, -0.2614921946477595],
 [0, 29, -0.2576834764839997],
 [0, 30, -0.2527938448905704],
 [0, 31, -0.2494749645808349],
 [0, 32, -0.235362685