In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
import argparse
import h5py
from scipy.stats import t as student_t
from statsmodels.stats import multitest as mt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# IDP_df = pd.read_pickle('male_IDP.pkl')
vars_df = pd.read_pickle('male_vars.pkl')
# IDP_names = np.loadtxt("IDP_names.txt", dtype=str, delimiter='\n')
# IDP_categories = np.loadtxt("IDP_categories.txt", dtype=str, delimiter='\n')
with open('varsHeader.txt') as f:
    varsHeader = f.readlines()
    varsHeader = [l.strip('\n\r') for l in varsHeader]
    varsHeader = np.array(varsHeader)
vars_categories = np.loadtxt("vars_categories.txt", dtype=str, delimiter='\n')

In [3]:
vars_df.head()

Unnamed: 0,modality,bonf,fdr,dataframe
0,T1_nonlinear,5.443607,3.776401,idx ...
1,T1_linear,5.443607,3.806226,idx ...
2,jacobian,5.443607,4.163317,idx ...
3,vbm,5.443607,3.691612,idx ...
4,T2_nonlinear,5.443607,3.536963,idx ...


In [4]:
vars_df.iloc[0].dataframe.head()

Unnamed: 0,idx,names,Categories,pearson_r,t_test_statistic,p_values_corrected,p_values,abs_pearson_r,log_p_values,log_p_values_corrected
0,0,Ethnic background (0.0),Ethnic Background,-0.008956,0.590713,1.0,0.554743,0.008956,0.255908,-0.0
1,1,Ethnic background (1.0),Ethnic Background,-0.056774,1.564572,0.788011,0.118101,0.056774,0.927745,0.103468
2,2,Ethnic background (2.0),Ethnic Background,0.016113,0.585492,1.0,0.558317,0.016113,0.253119,-0.0
3,3,Genotype measurement batch (0.0),Genetic Markers,-0.014051,0.918442,0.989835,0.358439,0.014051,0.445584,0.004437
4,4,Heterozygosity (0.0),Genetic Markers,-0.004644,0.303536,1.0,0.761496,0.004644,0.118332,-0.0


In [5]:
vars_over_bThr_list = []

for idx in range(len(vars_df)):
    df = vars_df.iloc[idx].dataframe
    bThr = vars_df.iloc[idx].bonf
    vars_over_bThr = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False).names.to_list()
    for var in vars_over_bThr:
        vars_over_bThr_list.append(var)

In [6]:
print("Number of unique vars across all modalities: ", len(set(vars_over_bThr_list)))

Number of unique vars across all modalities:  161


In [7]:
unique_vars_over_bTHr = list(set(vars_over_bThr_list))

In [8]:
unique_vars_over_bThr_categories = []

for var in unique_vars_over_bTHr:
    idx = list(varsHeader).index(var)
    category = vars_categories[idx]
    unique_vars_over_bThr_categories.append(category)

In [9]:
sorted(unique_vars_over_bTHr)

['Absence of notch position in the pulse waveform (2.0)',
 'Alcohol (1.0)',
 'Alcohol (2.0)',
 'Alcohol consumed (1.0)',
 'Alcohol intake frequency. (0.0)',
 'Alcohol intake frequency. (2.0)',
 'Amount of alcohol drunk on a typical drinking day (0.0)',
 'Apolipoprotein A (0.0)',
 'Arm BMD (bone mineral density) (left) (2.0)',
 'Arm BMD (bone mineral density) (right) (2.0)',
 'Arms BMC (bone mineral content) (2.0)',
 'Arms BMD (bone mineral density) (2.0)',
 'Average heart rate (2.0)',
 'Average weekly beer plus cider intake (2.0)',
 'Cardiac index during PWA (2.0)',
 'Cardiac index during PWA (2.1)',
 'Cardiac output during PWA (2.0)',
 'Cardiac output during PWA (2.1)',
 'Central pulse pressure during PWA (2.0)',
 'Central systolic blood pressure during PWA (2.0)',
 'Cereal intake (0.0)',
 'Current tobacco smoking (0.0)',
 'Current tobacco smoking (2.0)',
 'Diabetes diagnosed by doctor (0.0)',
 'Diabetes diagnosed by doctor (2.0)',
 'Diagnoses - ICD10 (E119 - E11.9 Without complicatio

In [10]:
modalities = []
pearson_rs_list = []
for var in unique_vars_over_bTHr:
    modalities_with_var = []
    pearsonRs_with_var = []
    for idx in range(len(vars_df)):
        df = vars_df.iloc[idx].dataframe
        modality = vars_df.iloc[idx].modality
        bThr = vars_df.iloc[idx].bonf
        df = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False)
        vars_over_bThr = df.names.to_list()
        pearson_r_over_bThr = df.pearson_r.to_list()
        if var in vars_over_bThr:
            modalities_with_var.append(modality)
            pearsonRs_with_var.append(pearson_r_over_bThr[vars_over_bThr.index(var)])
    modalities.append(modalities_with_var)
    pearson_rs_list.append(pearsonRs_with_var)

In [11]:
len(modalities)

161

In [12]:
df_vars_over_bThr = pd.DataFrame.from_dict({
    'names': unique_vars_over_bTHr,
    'Categories': unique_vars_over_bThr_categories,
    'modalities': modalities,
    'pearson_r': pearson_rs_list
})

In [13]:
df_vars_over_bThr.head()

Unnamed: 0,names,Categories,modalities,pearson_r
0,Diagnoses - main ICD10 (G35 - G35 Multiple scl...,Medical History,"[vbm, T2_lesions, tbss_L1]","[0.07133945495917554, 0.07753983273145704, 0.0..."
1,Femur troch BMD (bone mineral density) T-score...,Skeletal Measurements,"[rsfmri_9, rsfmri_15, rsfmri_21, rsfmri_24]","[-0.07357089286026602, -0.08553798373398441, -..."
2,Hand grip strength (left) (0.0),Physical Measurements,[rsfmri_2],[-0.07434458403079451]
3,Frequency of consuming six or more units of al...,Alcohol,[tbss_MD],[0.0936346112760032]
4,Diagnoses - ICD10 (E119 - E11.9 Without compli...,Medical History,"[vbm, T2_nonlinear, tbss_ICVF]","[0.08438892745293108, 0.07413818565389886, 0.0..."


In [14]:
df_vars_over_bThr[df_vars_over_bThr.names=='Arm fat mass (right) (0.0)']

Unnamed: 0,names,Categories,modalities,pearson_r


In [15]:
set(df_vars_over_bThr.Categories.to_list())

{'Alcohol',
 'Blood Assays',
 'Cardiac & Circulartory Measurements',
 'Cognitive Tests',
 'Diet',
 'Medical History',
 'Physical Measurements',
 'Skeletal Measurements',
 'Tobacco'}

In [16]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        for i, mod in enumerate(mods[0]):
            print('   -> {} ({:.3f})'.format(mod, prs[0][i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol consumed (1.0)
   Modalities:
   -> tbss_OD (0.122)
Var: Alcohol intake frequency. (0.0)
   Modalities:
   -> rsfmri_13 (0.073)
   -> tbss_FA_s (0.079)
   -> tbss_MO_s (0.079)
   -> tbss_ICVF (0.075)
   -> tbss_ISOVF (0.090)
   -> tbss_L1 (0.081)
   -> tbss_L2 (0.086)
   -> tbss_L3 (0.080)
   -> tbss_MD (0.094)
   -> tbss_OD (0.076)
Var: Alcohol intake frequency. (2.0)
   Modalities:
   -> tbss_FA_s (0.081)
   -> tbss_L1_s (0.079)
   -> tbss_MO_s (0.072)
   -> tbss_OD_s (0.070)
   -> tbss_ICVF (0.080)
   -> tbss_ISOVF (0.084)
   -> tbss_L1 (0.076)
   -> tbss_L2 (0.076)
   -> tbss_L3 (0.073)
   -> tbss_MD (0.086)
   -> tbss_OD (0.070)
Var: Amount of alcohol drunk on a typical drinking day (0.0)
   Modalities:
   -> tbss_ISOVF_s (0.090)
Var: Average weekly beer plus cider intake (2.0)
   Modalities:
   -> tbss_MO (0.091)
Var: Frequency of consuming six or more units of alcohol (0.0)
   Modalities:
   -> tbss_

   Modalities:
   -> vbm (0.097)
   -> T2_nonlinear (0.082)
   -> swi (0.071)
   -> rsfmri_17 (0.073)
   -> tbss_ICVF_s (0.070)
   -> tbss_ICVF (0.082)
Var: Diagnoses - secondary ICD10 (I10 - I10 Essential (primary) hypertension)
   Modalities:
   -> jacobian (0.086)
   -> vbm (0.085)
   -> T2_nonlinear (0.086)
   -> T2_lesions (0.094)
   -> swi (0.097)
   -> tracts (0.076)
   -> tbss_FA_s (0.096)
   -> tbss_ISOVF_s (0.107)
   -> tbss_L1_s (0.093)
   -> tbss_L2_s (0.100)
   -> tbss_L3_s (0.108)
   -> tbss_MD_s (0.092)
   -> tbss_OD_s (0.075)
   -> tbss_FA (0.109)
   -> tbss_ICVF (0.089)
   -> tbss_ISOVF (0.102)
   -> tbss_L1 (0.102)
   -> tbss_L2 (0.096)
   -> tbss_L3 (0.099)
   -> tbss_MD (0.103)
   -> tbss_MO (0.075)
   -> tbss_OD (0.081)
Var: Diagnoses - secondary ICD10 (K824 - K82.4 Cholesterolosis of gallbladder)
   Modalities:
   -> rsfmri_22 (0.070)
Var: Diagnoses - secondary ICD10 (Z864 - Z86.4 Personal history of psychoactive substance abuse)
   Modalities:
   -> swi (0.081)
 

   -> tfmri_c_5 (-0.073)
Var: Whole body fat-free mass (0.0)
   Modalities:
   -> rsfmri_2 (-0.074)
Var: Whole body water mass (2.0)
   Modalities:
   -> rsfmri_2 (-0.072)


Category: Skeletal Measurements
------------------------------------------------
Var: Arm BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> rsfmri_2 (-0.126)
Var: Arm BMD (bone mineral density) (right) (2.0)
   Modalities:
   -> rsfmri_2 (-0.137)
Var: Arms BMC (bone mineral content) (2.0)
   Modalities:
   -> rsfmri_2 (-0.091)
   -> rsfmri_9 (-0.080)
   -> rsfmri_12 (-0.083)
   -> rsfmri_13 (-0.075)
   -> rsfmri_15 (-0.080)
   -> rsfmri_16 (-0.076)
   -> rsfmri_17 (-0.086)
   -> rsfmri_18 (-0.074)
   -> rsfmri_19 (-0.079)
   -> rsfmri_24 (-0.085)
Var: Arms BMD (bone mineral density) (2.0)
   Modalities:
   -> rsfmri_2 (-0.077)
   -> rsfmri_15 (-0.082)
   -> rsfmri_21 (-0.077)
   -> rsfmri_24 (-0.082)
Var: Femur neck BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> rsfmri_9 (-0.075)
   -> rsfm

In [17]:
sorted(prs[0], reverse=True)

[0.08977919271356237,
 0.08576788588645898,
 0.08274892399707524,
 0.07916980304573908,
 0.07812473987044988,
 0.07585478983665053,
 0.07469148009779372,
 0.07287333187569377,
 0.07230744565775098]

In [18]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        for i, mod in enumerate(mods):
            print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol consumed (1.0)
   Modalities:
   -> tbss_OD (0.122)
Var: Alcohol intake frequency. (0.0)
   Modalities:
   -> tbss_MD (0.094)
   -> tbss_ISOVF (0.090)
   -> tbss_L2 (0.086)
   -> tbss_L1 (0.081)
   -> tbss_L3 (0.080)
   -> tbss_FA_s (0.079)
   -> tbss_MO_s (0.079)
   -> tbss_OD (0.076)
   -> tbss_ICVF (0.075)
   -> rsfmri_13 (0.073)
Var: Alcohol intake frequency. (2.0)
   Modalities:
   -> tbss_MD (0.086)
   -> tbss_ISOVF (0.084)
   -> tbss_FA_s (0.081)
   -> tbss_ICVF (0.080)
   -> tbss_L1_s (0.079)
   -> tbss_L1 (0.076)
   -> tbss_L2 (0.076)
   -> tbss_L3 (0.073)
   -> tbss_MO_s (0.072)
   -> tbss_OD_s (0.070)
   -> tbss_OD (0.070)
Var: Amount of alcohol drunk on a typical drinking day (0.0)
   Modalities:
   -> tbss_ISOVF_s (0.090)
Var: Average weekly beer plus cider intake (2.0)
   Modalities:
   -> tbss_MO (0.091)
Var: Frequency of consuming six or more units of alcohol (0.0)
   Modalities:
   -> tbss_

   -> tbss_FA_s (0.090)
   -> tbss_MD_s (0.090)
   -> T2_lesions (0.085)
   -> tbss_ICVF (0.085)
   -> swi (0.084)
   -> tbss_OD (0.083)
   -> jacobian (0.082)
   -> T2_nonlinear (0.080)
   -> vbm (0.079)
   -> tracts (0.073)
Var: Diagnoses - ICD10 (Z864 - Z86.4 Personal history of psychoactive substance abuse)
   Modalities:
   -> tbss_MO (0.080)
   -> swi (0.078)
   -> tbss_L1_s (0.077)
   -> tbss_L2_s (0.070)
Var: Diagnoses - main ICD10 (G35 - G35 Multiple sclerosis)
   Modalities:
   -> T2_lesions (0.078)
   -> tbss_L1 (0.075)
   -> vbm (0.071)
Var: Diagnoses - secondary ICD10 (E119 - E11.9 Without complications)
   Modalities:
   -> vbm (0.097)
   -> tbss_ICVF (0.082)
   -> T2_nonlinear (0.082)
   -> rsfmri_17 (0.073)
   -> swi (0.071)
   -> tbss_ICVF_s (0.070)
Var: Diagnoses - secondary ICD10 (I10 - I10 Essential (primary) hypertension)
   Modalities:
   -> tbss_FA (0.109)
   -> tbss_L3_s (0.108)
   -> tbss_ISOVF_s (0.107)
   -> tbss_MD (0.103)
   -> tbss_ISOVF (0.102)
   -> tbss

   Modalities:
   -> rsfmri_24 (-0.083)
   -> rsfmri_17 (-0.078)
   -> rsfmri_9 (-0.075)
Var: Femur neck BMD (bone mineral density) T-score (left) (2.0)
   Modalities:
   -> rsfmri_24 (-0.080)
   -> rsfmri_13 (-0.077)
   -> rsfmri_17 (-0.077)
   -> rsfmri_9 (-0.076)
   -> rsfmri_16 (-0.074)
Var: Femur neck BMD (bone mineral density) T-score (right) (2.0)
   Modalities:
   -> rsfmri_24 (-0.084)
   -> rsfmri_17 (-0.080)
   -> rsfmri_15 (-0.077)
Var: Femur shaft BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> rsfmri_9 (-0.080)
   -> rsfmri_17 (-0.078)
   -> rsfmri_24 (-0.075)
   -> rsfmri_21 (-0.074)
Var: Femur shaft BMD (bone mineral density) (right) (2.0)
   Modalities:
   -> rsfmri_17 (-0.080)
   -> rsfmri_13 (-0.076)
   -> rsfmri_24 (-0.076)
   -> rsfmri_10 (-0.075)
Var: Femur total BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> rsfmri_24 (-0.087)
   -> rsfmri_21 (-0.082)
   -> rsfmri_15 (-0.081)
   -> rsfmri_9 (-0.080)
   -> rsfmri_17 (-0.079)
Var: Femur to

In [23]:
variance_threshold = 0.1

for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        if prs[0] >= variance_threshold or abs(prs[-1])>= variance_threshold:
            print('Var: {}'.format(name))
            print('   Modalities:')
            for i, mod in enumerate(mods):
                if prs[i] >= variance_threshold or abs(prs[i])>= variance_threshold:
                    print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol consumed (1.0)
   Modalities:
   -> tbss_OD (0.122)


Category: Blood Assays
------------------------------------------------


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Cardiac index during PWA (2.0)
   Modalities:
   -> T2_lesions (0.112)
   -> T2_nonlinear (0.101)
Var: Cardiac index during PWA (2.1)
   Modalities:
   -> tbss_L2 (0.100)
Var: Cardiac output during PWA (2.0)
   Modalities:
   -> T2_lesions (0.101)
Var: Central systolic blood pressure during PWA (2.0)
   Modalities:
   -> T2_lesions (0.111)
   -> rsfmri_22 (0.104)
   -> tbss_MD (0.101)
Var: Number of trend entries (1.0)
   Modalities:
   -> rsfmri_9 (-0.172)
Var: Systolic blood pressure, automated reading (0.1)
   Modalities:
   -> tbss_L3_s (0.110)
   -> tbss_ISOVF_s (0.102)
   -> tbss_ISOVF (0.102)
   -> T2_lesions (0.102)
Var: Systolic brachial blood pressure (2.0)
   Modalities:
 

In [20]:
mods

('tbss_ISOVF',
 'tbss_L2_s',
 'tbss_MO',
 'tbss_ICVF',
 'tbss_L1',
 'tbss_L2',
 'tbss_L3',
 'tbss_MO_s',
 'tbss_FA')

In [21]:
counter = 0
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()[0]
        if 'T1_nonlinear' in mods or 'T1_linear' in mods or 'T2_nonlinear' in mods:
            pass
        else:
            print('Var: {}'.format(name))
            counter += 1
    print('\n')
    
print("There are {} vars independent from the conventional modalities".format(counter))

Category: Alcohol
------------------------------------------------
Var: Alcohol consumed (1.0)
Var: Alcohol intake frequency. (0.0)
Var: Alcohol intake frequency. (2.0)
Var: Amount of alcohol drunk on a typical drinking day (0.0)
Var: Average weekly beer plus cider intake (2.0)
Var: Frequency of consuming six or more units of alcohol (0.0)
Var: Frequency of drinking alcohol (0.0)


Category: Blood Assays
------------------------------------------------
Var: Apolipoprotein A (0.0)
Var: Gamma glutamyltransferase (0.0)
Var: Glucose (0.0)
Var: HDL cholesterol (0.0)
Var: Mean corpuscular haemoglobin (0.0)
Var: Mean corpuscular volume (0.0)
Var: Mean sphered cell volume (0.0)
Var: Total protein (0.0)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Absence of notch position in the pulse waveform (2.0)
Var: Average heart rate (2.0)
Var: Diastolic blood pressure, automated reading (0.0)
Var: Diastolic blood pressure, automated reading (0.1)


In [22]:
counter = 0
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()[0]
        if 'T1_nonlinear' in mods or 'T1_linear' in mods or 'T2_nonlinear' in mods:
            print('Var: {}'.format(name))
            counter += 1
        else:
            pass
    print('\n')
    
print("There are {} vars in the conventional modalities".format(counter))

Category: Alcohol
------------------------------------------------


Category: Blood Assays
------------------------------------------------
Var: Glycated haemoglobin (HbA1c) (0.0)
Var: IGF-1 (0.0)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Cardiac index during PWA (2.0)
Var: Cardiac index during PWA (2.1)
Var: Cardiac output during PWA (2.0)
Var: Cardiac output during PWA (2.1)
Var: Central pulse pressure during PWA (2.0)
Var: Central systolic blood pressure during PWA (2.0)
Var: Peripheral pulse pressure during PWA (2.0)
Var: Systolic blood pressure, automated reading (0.0)
Var: Systolic blood pressure, automated reading (0.1)
Var: Systolic brachial blood pressure (2.0)


Category: Cognitive Tests
------------------------------------------------


Category: Diet
------------------------------------------------


Category: Medical History
------------------------------------------------
Var: Diabetes diagnosed by doctor (0.0)
