In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
import argparse
import h5py
from scipy.stats import t as student_t
from statsmodels.stats import multitest as mt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [21]:
# IDP_df = pd.read_pickle('male_IDP_prototype.pkl')
# vars_df = pd.read_pickle('male_vars_prototype.pkl')

IDP_df = pd.read_pickle('female_feature_direction_ICA_deltas_IDP_deconf_short.pkl')
vars_df = pd.read_pickle('female_feature_direction_ICA_deltas_nIDP_deconf_short.pkl')

IDP_names = np.loadtxt("IDP_names.txt", dtype=str, delimiter='\n')
IDP_categories = np.loadtxt("IDP_categories.txt", dtype=str, delimiter='\n')
with open('varsHeader.txt') as f:
    varsHeader = f.readlines()
    varsHeader = [l.strip('\n\r') for l in varsHeader]
    varsHeader = np.array(varsHeader)
vars_categories = np.loadtxt("vars_categories.txt", dtype=str, delimiter='\n')

In [22]:
vars_df.head()

Unnamed: 0,modality,bonf,fdr,dataframe
0,IC = 0,5.509794,3.426634,idx ...
1,IC = 1,5.509794,3.1093,idx ...


In [23]:
vars_df.iloc[0].dataframe.head()

Unnamed: 0,idx,names,Categories,pearson_r,t_test_statistic,p_values_corrected,p_values,abs_pearson_r,log_p_values,log_p_values_corrected
0,0,Ethnic background (0.0),Ethnic Background,0.024623,1.743049,0.709915,0.081386,0.024623,1.089448,0.148794
1,1,Ethnic background (1.0),Ethnic Background,0.019358,0.559812,1.0,0.575758,0.019358,0.23976,-0.0
2,2,Ethnic background (2.0),Ethnic Background,0.01388,0.52365,1.0,0.600603,0.01388,0.221412,-0.0
3,3,Genotype measurement batch (0.0),Genetic Markers,0.04166,2.912191,0.229548,0.003605,0.04166,2.443057,0.639127
4,4,Heterozygosity (0.0),Genetic Markers,0.003603,0.251629,1.0,0.801339,0.003603,0.096184,-0.0


In [24]:
vars_over_bThr_list = []

for idx in range(len(vars_df)):
    df = vars_df.iloc[idx].dataframe
    bThr = vars_df.iloc[idx].bonf
    vars_over_bThr = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False).names.to_list()
    for var in vars_over_bThr:
        vars_over_bThr_list.append(var)

In [25]:
print("Number of unique vars across all modalities: ", len(set(vars_over_bThr_list)))

Number of unique vars across all modalities:  157


In [26]:
unique_vars_over_bTHr = list(set(vars_over_bThr_list))

In [27]:
unique_vars_over_bThr_categories = []

for var in unique_vars_over_bTHr:
    idx = list(varsHeader).index(var)
    category = vars_categories[idx]
    unique_vars_over_bThr_categories.append(category)

In [28]:
sorted(unique_vars_over_bTHr)

['Alcohol intake frequency. (0.0)',
 'Alcohol intake frequency. (2.0)',
 'Android bone mass (2.0)',
 'Android tissue fat percentage (2.0)',
 'Android total mass (2.0)',
 'Arm BMD (bone mineral density) (left) (2.0)',
 'Arm BMD (bone mineral density) (right) (2.0)',
 'Arm fat mass (right) (0.0)',
 'Arm fat mass (right) (2.0)',
 'Arm fat percentage (left) (0.0)',
 'Arm fat percentage (left) (2.0)',
 'Arm fat percentage (right) (0.0)',
 'Arm fat percentage (right) (2.0)',
 'Arm fat-free mass (left) (0.0)',
 'Arm fat-free mass (right) (0.0)',
 'Arm fat-free mass (right) (2.0)',
 'Arm predicted mass (left) (0.0)',
 'Arm total mass (left) (2.0)',
 'Arm total mass (right) (2.0)',
 'Arms BMC (bone mineral content) (2.0)',
 'Arms BMD (bone mineral density) (2.0)',
 'Arms tissue fat percentage (2.0)',
 'Arms total mass (2.0)',
 'Average weekly red wine intake (0.0)',
 'Body fat percentage (0.0)',
 'Body fat percentage (2.0)',
 'Body mass index (BMI) (0.0)',
 'Body mass index (BMI) (2.0)',
 'Body

In [29]:
modalities = []
pearson_rs_list = []
for var in unique_vars_over_bTHr:
    modalities_with_var = []
    pearsonRs_with_var = []
    for idx in range(len(vars_df)):
        df = vars_df.iloc[idx].dataframe
        modality = vars_df.iloc[idx].modality
        bThr = vars_df.iloc[idx].bonf
        df = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False)
        vars_over_bThr = df.names.to_list()
        pearson_r_over_bThr = df.pearson_r.to_list()
        if var in vars_over_bThr:
            modalities_with_var.append(modality)
            pearsonRs_with_var.append(pearson_r_over_bThr[vars_over_bThr.index(var)])
    modalities.append(modalities_with_var)
    pearson_rs_list.append(pearsonRs_with_var)

In [30]:
len(modalities)

157

In [31]:
df_vars_over_bThr = pd.DataFrame.from_dict({
    'names': unique_vars_over_bTHr,
    'Categories': unique_vars_over_bThr_categories,
    'modalities': modalities,
    'pearson_r': pearson_rs_list
})

In [32]:
df_vars_over_bThr.head()

Unnamed: 0,names,Categories,modalities,pearson_r
0,"Heel bone mineral density (BMD), manual entry ...",Skeletal Measurements,[IC = 1],[-0.24380500185656462]
1,Body fat percentage (2.0),Physical Measurements,[IC = 1],[-0.11665958405269349]
2,Speed of sound through heel (left) (1.0),Skeletal Measurements,[IC = 1],[-0.21397913655152848]
3,Head BMD (bone mineral density) (2.0),Skeletal Measurements,"[IC = 0, IC = 1]","[-0.23681547634479722, -0.3649833462142374]"
4,Gynoid total mass (2.0),Physical Measurements,[IC = 1],[-0.1351350250251737]


In [33]:
set(df_vars_over_bThr.Categories.to_list())

{'Alcohol',
 'Blood Assays',
 'Cardiac & Circulartory Measurements',
 'Diet',
 'Medical History',
 'Physical Measurements',
 'Skeletal Measurements'}

In [34]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        for i, mod in enumerate(mods[0]):
            print('   -> {} ({:.3f})'.format(mod, prs[0][i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol intake frequency. (0.0)
   Modalities:
   -> IC = 1 (0.087)
Var: Alcohol intake frequency. (2.0)
   Modalities:
   -> IC = 1 (0.094)
Var: Average weekly red wine intake (0.0)
   Modalities:
   -> IC = 1 (0.083)
Var: Frequency of drinking alcohol (0.0)
   Modalities:
   -> IC = 1 (0.114)


Category: Blood Assays
------------------------------------------------
Var: C-reactive protein (0.0)
   Modalities:
   -> IC = 1 (-0.091)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Cardiac index during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.074)
Var: Cardiac output during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.077)
Var: Central pulse pressure during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.076)
Var: Central systolic blood pressure during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.074)
Var: Peripheral pulse pressure during PWA (2.0)
   Modalities:
   -> IC = 

In [35]:
sorted(prs[0], reverse=True)

[-0.1053154210731249]

In [36]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        for i, mod in enumerate(mods):
            print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol intake frequency. (0.0)
   Modalities:
   -> IC = 1 (0.087)
Var: Alcohol intake frequency. (2.0)
   Modalities:
   -> IC = 1 (0.094)
Var: Average weekly red wine intake (0.0)
   Modalities:
   -> IC = 1 (0.083)
Var: Frequency of drinking alcohol (0.0)
   Modalities:
   -> IC = 1 (0.114)


Category: Blood Assays
------------------------------------------------
Var: C-reactive protein (0.0)
   Modalities:
   -> IC = 1 (-0.091)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Cardiac index during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.074)
Var: Cardiac output during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.077)
Var: Central pulse pressure during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.076)
Var: Central systolic blood pressure during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.074)
Var: Peripheral pulse pressure during PWA (2.0)
   Modalities:
   -> IC = 

In [37]:
variance_threshold = 0.1

for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        if prs[0] >= variance_threshold:
            print('Var: {}'.format(name))
            print('   Modalities:')
            for i, mod in enumerate(mods):
                if prs[i] >= variance_threshold:
                    print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Frequency of drinking alcohol (0.0)
   Modalities:
   -> IC = 1 (0.114)


Category: Blood Assays
------------------------------------------------


Category: Cardiac & Circulartory Measurements
------------------------------------------------


Category: Diet
------------------------------------------------


Category: Medical History
------------------------------------------------


Category: Physical Measurements
------------------------------------------------


Category: Skeletal Measurements
------------------------------------------------




In [38]:
mods

('IC = 1',)

In [39]:
counter = 0
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()[0]
        if 'ElasticNetFull' in mods or 'ElasticNetFull' in mods or 'ElasticNetFull' in mods:
            print('Var: {}'.format(name))
            counter += 1
        else:
            pass
    print('\n')
    
print("There are {} vars independent from the conventional modalities".format(counter))

Category: Alcohol
------------------------------------------------


Category: Blood Assays
------------------------------------------------


Category: Cardiac & Circulartory Measurements
------------------------------------------------


Category: Diet
------------------------------------------------


Category: Medical History
------------------------------------------------


Category: Physical Measurements
------------------------------------------------


Category: Skeletal Measurements
------------------------------------------------


There are 0 vars independent from the conventional modalities


In [40]:
IDP_df0 = pd.read_pickle('female_IDP.pkl')
vars_df0 = pd.read_pickle('female_vars.pkl')

In [41]:
vars0_over_bThr_list = []

for idx in range(len(vars_df0)):
    df = vars_df0.iloc[idx].dataframe
    bThr = vars_df0.iloc[idx].bonf
    vars_over_bThr = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False).names.to_list()
    for var in vars_over_bThr:
        vars0_over_bThr_list.append(var)

In [42]:
print("Number of unique vars across all modalities: ", len(set(vars0_over_bThr_list)))

Number of unique vars across all modalities:  214


In [43]:
unique_vars0_over_bTHr = list(set(vars0_over_bThr_list))
unique_vars0_over_bThr_categories = []

for var in unique_vars0_over_bTHr:
    idx = list(varsHeader).index(var)
    category = vars_categories[idx]
    unique_vars0_over_bThr_categories.append(category)
    
modalities0 = []
pearson_rs_list0 = []
for var in unique_vars0_over_bTHr:
    modalities_with_var = []
    pearsonRs_with_var = []
    for idx in range(len(vars_df0)):
        df = vars_df0.iloc[idx].dataframe
        modality = vars_df0.iloc[idx].modality
        bThr = vars_df0.iloc[idx].bonf
        df = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False)
        vars_over_bThr = df.names.to_list()
        pearson_r_over_bThr = df.pearson_r.to_list()
        if var in vars_over_bThr:
            modalities_with_var.append(modality)
            pearsonRs_with_var.append(pearson_r_over_bThr[vars_over_bThr.index(var)])
    modalities0.append(modalities_with_var)
    pearson_rs_list0.append(pearsonRs_with_var)

In [44]:
df_vars0_over_bThr = pd.DataFrame.from_dict({
    'names': unique_vars0_over_bTHr,
    'Categories': unique_vars0_over_bThr_categories,
    'modalities': modalities0,
    'pearson_r': pearson_rs_list0
})

In [45]:
df_vars0_over_bThr[df_vars0_over_bThr.names=='Alcohol intake frequency. (2.0)']

Unnamed: 0,names,Categories,modalities,pearson_r
36,Alcohol intake frequency. (2.0),Alcohol,"[jacobian, vbm, swi, rsfmri_0, rsfmri_7, rsfmr...","[0.07935210185576791, 0.07653801567002164, 0.0..."


In [46]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        mods0 = df_vars0_over_bThr[(df_vars0_over_bThr.Categories == cat) & (df_vars0_over_bThr.names == name)].modalities.to_list()
        prs0 = df_vars0_over_bThr[(df_vars0_over_bThr.Categories == cat) & (df_vars0_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        for i, mod in enumerate(mods):
            print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
        
        if len(mods0) == 0:
            print('======== ATTENTION! NEW VAR ! ==========')
        else:
            prs_abs0 = np.abs(prs0).tolist()
            _, mods0 = zip(*sorted(zip(prs_abs0[0], mods0[0]), reverse=True))
            _, prs0 = zip(*sorted(zip(prs_abs0[0], prs0[0]), reverse=True))
        

#         for i, mod in enumerate(mods0):
#             print('   -> {} ({:.3f})'.format(mod, prs0[i]))
            print('   -> {} ({:.3f})'.format(mods0[0], prs0[0]))
        
    print('\n')

Category: Alcohol
------------------------------------------------
Var: Alcohol intake frequency. (0.0)
   Modalities:
   -> IC = 1 (0.087)
   -> tbss_L1_s (0.098)
Var: Alcohol intake frequency. (2.0)
   Modalities:
   -> IC = 1 (0.094)
   -> tbss_ISOVF_s (0.097)
Var: Average weekly red wine intake (0.0)
   Modalities:
   -> IC = 1 (0.083)
   -> tbss_FA_s (0.090)
Var: Frequency of drinking alcohol (0.0)
   Modalities:
   -> IC = 1 (0.114)
   -> tbss_MD (0.112)


Category: Blood Assays
------------------------------------------------
Var: C-reactive protein (0.0)
   Modalities:
   -> IC = 1 (-0.091)
   -> rsfmri_0 (-0.105)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Cardiac index during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.074)
   -> tbss_L1_s (0.091)
Var: Cardiac output during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.077)
   -> tbss_L1_s (0.073)
Var: Central pulse pressure during PWA (2.0)
   Modalities:
   -> IC = 0 (-0.0

   Modalities:
   -> IC = 1 (-0.282)
   -> IC = 0 (-0.145)
   -> rsfmri_5 (-0.260)
Var: Femur shaft BMD (bone mineral density) (right) (2.0)
   Modalities:
   -> IC = 1 (-0.284)
   -> IC = 0 (-0.138)
   -> rsfmri_2 (-0.257)
Var: Femur total BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> IC = 1 (-0.293)
   -> IC = 0 (-0.148)
   -> rsfmri_5 (-0.266)
Var: Femur total BMD (bone mineral density) (right) (2.0)
   Modalities:
   -> IC = 1 (-0.290)
   -> IC = 0 (-0.140)
   -> rsfmri_21 (-0.260)
Var: Femur total BMD (bone mineral density) T-score (left) (2.0)
   Modalities:
   -> IC = 1 (-0.294)
   -> IC = 0 (-0.149)
   -> rsfmri_5 (-0.267)
Var: Femur total BMD (bone mineral density) T-score (right) (2.0)
   Modalities:
   -> IC = 1 (-0.291)
   -> IC = 0 (-0.141)
   -> rsfmri_21 (-0.261)
Var: Femur troch BMD (bone mineral density) (left) (2.0)
   Modalities:
   -> IC = 1 (-0.255)
   -> IC = 0 (-0.129)
   -> rsfmri_5 (-0.229)
Var: Femur troch BMD (bone mineral density) (right) (2.0