In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
import argparse
import h5py
from scipy.stats import t as student_t
from statsmodels.stats import multitest as mt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# IDP_df = pd.read_pickle('male_IDP_prototype.pkl')
# vars_df = pd.read_pickle('male_vars_prototype.pkl')

IDP_df = pd.read_pickle('male_feature_direction_ICA_deltas_IDP_deconf_short.pkl')
vars_df = pd.read_pickle('male_feature_direction_ICA_deltas_nIDP_deconf_short.pkl')

IDP_names = np.loadtxt("IDP_names.txt", dtype=str, delimiter='\n')
IDP_categories = np.loadtxt("IDP_categories.txt", dtype=str, delimiter='\n')
with open('varsHeader.txt') as f:
    varsHeader = f.readlines()
    varsHeader = [l.strip('\n\r') for l in varsHeader]
    varsHeader = np.array(varsHeader)
vars_categories = np.loadtxt("vars_categories.txt", dtype=str, delimiter='\n')

In [3]:
vars_df.head()

Unnamed: 0,modality,bonf,fdr,dataframe
0,IC = 0,5.508233,4.139968,idx ...
1,IC = 1,5.508233,5.644288,idx ...
2,IC = 2,5.508233,,idx ...
3,IC = 3,5.508233,5.100579,idx ...
4,IC = 4,5.508233,3.494965,idx ...


In [4]:
vars_df.iloc[0].dataframe.head()

Unnamed: 0,idx,names,Categories,pearson_r,t_test_statistic,p_values_corrected,p_values,abs_pearson_r,log_p_values,log_p_values_corrected
0,0,Ethnic background (0.0),Ethnic Background,0.029002,1.913646,0.800401,0.055731,0.029002,1.253903,0.096693
1,1,Ethnic background (1.0),Ethnic Background,0.033657,0.92654,1.0,0.35446,0.033657,0.450432,-0.0
2,2,Ethnic background (2.0),Ethnic Background,-0.019294,0.701127,1.0,0.483347,0.019294,0.315741,-0.0
3,3,Genotype measurement batch (0.0),Genetic Markers,0.009149,0.598033,1.0,0.54985,0.009149,0.259756,-0.0
4,4,Heterozygosity (0.0),Genetic Markers,-0.012502,0.817222,1.0,0.413847,0.012502,0.38316,-0.0


In [5]:
vars_over_bThr_list = []

for idx in range(len(vars_df)):
    df = vars_df.iloc[idx].dataframe
    bThr = vars_df.iloc[idx].bonf
    vars_over_bThr = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False).names.to_list()
    for var in vars_over_bThr:
        vars_over_bThr_list.append(var)

In [6]:
print("Number of unique vars across all modalities: ", len(set(vars_over_bThr_list)))

Number of unique vars across all modalities:  37


In [7]:
unique_vars_over_bTHr = list(set(vars_over_bThr_list))

In [8]:
unique_vars_over_bThr_categories = []

for var in unique_vars_over_bTHr:
    idx = list(varsHeader).index(var)
    category = vars_categories[idx]
    unique_vars_over_bThr_categories.append(category)

In [9]:
sorted(unique_vars_over_bTHr)

['Age first had sexual intercourse (0.0)',
 'Antibiotic codes for last 3 months (1140874138 - ciprofloxacin)',
 'Diabetes diagnosed by doctor (0.0)',
 'Diabetes diagnosed by doctor (2.0)',
 'Diagnoses - ICD10 (I10 - I10 Essential (primary) hypertension)',
 'Diagnoses - ICD10 (Z864 - Z86.4 Personal history of psychoactive substance abuse)',
 'Diagnoses - secondary ICD10 (E119 - E11.9 Without complications)',
 'Diagnoses - secondary ICD10 (I10 - I10 Essential (primary) hypertension)',
 'Diagnoses - secondary ICD10 (Z864 - Z86.4 Personal history of psychoactive substance abuse)',
 'Forced expiratory volume in 1-second (FEV1) (2.0)',
 'Forced expiratory volume in 1-second (FEV1) (2.1)',
 'Head BMC (bone mineral content) (2.0)',
 'Head BMD (bone mineral density) (2.0)',
 'IGF-1 (0.0)',
 'L1-L4 BMC (bone mineral content) (2.0)',
 'L1-L4 BMD (bone mineral density) (2.0)',
 'Legs BMD (bone mineral density) (2.0)',
 'Mean corpuscular haemoglobin (0.0)',
 'Non-cancer illness code, self-reported 

In [10]:
modalities = []
pearson_rs_list = []
for var in unique_vars_over_bTHr:
    modalities_with_var = []
    pearsonRs_with_var = []
    for idx in range(len(vars_df)):
        df = vars_df.iloc[idx].dataframe
        modality = vars_df.iloc[idx].modality
        bThr = vars_df.iloc[idx].bonf
        df = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False)
        vars_over_bThr = df.names.to_list()
        pearson_r_over_bThr = df.pearson_r.to_list()
        if var in vars_over_bThr:
            modalities_with_var.append(modality)
            pearsonRs_with_var.append(pearson_r_over_bThr[vars_over_bThr.index(var)])
    modalities.append(modalities_with_var)
    pearson_rs_list.append(pearsonRs_with_var)

In [11]:
len(modalities)

37

In [12]:
df_vars_over_bThr = pd.DataFrame.from_dict({
    'names': unique_vars_over_bTHr,
    'Categories': unique_vars_over_bThr_categories,
    'modalities': modalities,
    'pearson_r': pearson_rs_list
})

In [13]:
df_vars_over_bThr.head()

Unnamed: 0,names,Categories,modalities,pearson_r
0,Taking other prescription medications (2.0),Medical History,[IC = 4],[0.07157813334326411]
1,Head BMD (bone mineral density) (2.0),Skeletal Measurements,"[IC = 0, IC = 1]","[-0.07993056789703824, -0.08904805258578735]"
2,Records in HES inpatient diagnoses dataset (0.0),Medical History,[IC = 4],[0.08415290697226151]
3,Treatment/medication code (1140884600 - metfor...,Medical History,[IC = 4],[0.09213896415821622]
4,Number of treatments/medications taken (2.0),Medical History,[IC = 4],[0.0814647267353815]


In [14]:
set(df_vars_over_bThr.Categories.to_list())

{'Blood Assays',
 'Cardiac & Circulartory Measurements',
 'Cognitive Tests',
 'Lifestyle',
 'Medical History',
 'Physical Measurements',
 'Skeletal Measurements'}

In [15]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        for i, mod in enumerate(mods[0]):
            print('   -> {} ({:.3f})'.format(mod, prs[0][i]))
        
    print('\n')

Category: Blood Assays
------------------------------------------------
Var: IGF-1 (0.0)
   Modalities:
   -> IC = 4 (-0.084)
Var: Mean corpuscular haemoglobin (0.0)
   Modalities:
   -> IC = 3 (0.072)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Systolic blood pressure, automated reading (0.0)
   Modalities:
   -> IC = 4 (0.076)
Var: Systolic blood pressure, automated reading (0.1)
   Modalities:
   -> IC = 4 (0.082)


Category: Cognitive Tests
------------------------------------------------
Var: Number of symbol digit matches attempted (2.0)
   Modalities:
   -> IC = 4 (-0.089)
Var: Number of symbol digit matches made correctly (2.0)
   Modalities:
   -> IC = 4 (-0.093)


Category: Lifestyle
------------------------------------------------
Var: Age first had sexual intercourse (0.0)
   Modalities:
   -> IC = 0 (0.079)


Category: Medical History
------------------------------------------------
Var: Antibiotic codes for last 3 

In [16]:
sorted(prs[0], reverse=True)

[-0.08493449380963183]

In [17]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        for i, mod in enumerate(mods):
            print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Blood Assays
------------------------------------------------
Var: IGF-1 (0.0)
   Modalities:
   -> IC = 4 (-0.084)
Var: Mean corpuscular haemoglobin (0.0)
   Modalities:
   -> IC = 3 (0.072)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Systolic blood pressure, automated reading (0.0)
   Modalities:
   -> IC = 4 (0.076)
Var: Systolic blood pressure, automated reading (0.1)
   Modalities:
   -> IC = 4 (0.082)


Category: Cognitive Tests
------------------------------------------------
Var: Number of symbol digit matches attempted (2.0)
   Modalities:
   -> IC = 4 (-0.089)
Var: Number of symbol digit matches made correctly (2.0)
   Modalities:
   -> IC = 4 (-0.093)


Category: Lifestyle
------------------------------------------------
Var: Age first had sexual intercourse (0.0)
   Modalities:
   -> IC = 0 (0.079)


Category: Medical History
------------------------------------------------
Var: Antibiotic codes for last 3 

In [18]:
variance_threshold = 0.1

for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        if prs[0] >= variance_threshold:
            print('Var: {}'.format(name))
            print('   Modalities:')
            for i, mod in enumerate(mods):
                if prs[i] >= variance_threshold:
                    print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
    print('\n')

Category: Blood Assays
------------------------------------------------


Category: Cardiac & Circulartory Measurements
------------------------------------------------


Category: Cognitive Tests
------------------------------------------------


Category: Lifestyle
------------------------------------------------


Category: Medical History
------------------------------------------------
Var: Diabetes diagnosed by doctor (2.0)
   Modalities:
   -> IC = 4 (0.108)


Category: Physical Measurements
------------------------------------------------


Category: Skeletal Measurements
------------------------------------------------




In [19]:
mods

('IC = 0',)

In [20]:
counter = 0
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()[0]
        if 'ElasticNetFull' in mods or 'ElasticNetFull' in mods or 'ElasticNetFull' in mods:
            print('Var: {}'.format(name))
            counter += 1
        else:
            pass
    print('\n')
    
print("There are {} vars independent from the conventional modalities".format(counter))

Category: Blood Assays
------------------------------------------------


Category: Cardiac & Circulartory Measurements
------------------------------------------------


Category: Cognitive Tests
------------------------------------------------


Category: Lifestyle
------------------------------------------------


Category: Medical History
------------------------------------------------


Category: Physical Measurements
------------------------------------------------


Category: Skeletal Measurements
------------------------------------------------


There are 0 vars independent from the conventional modalities


In [21]:
IDP_df0 = pd.read_pickle('male_IDP.pkl')
vars_df0 = pd.read_pickle('male_vars.pkl')

In [22]:
vars0_over_bThr_list = []

for idx in range(len(vars_df0)):
    df = vars_df0.iloc[idx].dataframe
    bThr = vars_df0.iloc[idx].bonf
    vars_over_bThr = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False).names.to_list()
    for var in vars_over_bThr:
        vars0_over_bThr_list.append(var)

In [23]:
print("Number of unique vars across all modalities: ", len(set(vars0_over_bThr_list)))

Number of unique vars across all modalities:  161


In [24]:
unique_vars0_over_bTHr = list(set(vars0_over_bThr_list))
unique_vars0_over_bThr_categories = []

for var in unique_vars0_over_bTHr:
    idx = list(varsHeader).index(var)
    category = vars_categories[idx]
    unique_vars0_over_bThr_categories.append(category)
    
modalities0 = []
pearson_rs_list0 = []
for var in unique_vars0_over_bTHr:
    modalities_with_var = []
    pearsonRs_with_var = []
    for idx in range(len(vars_df0)):
        df = vars_df0.iloc[idx].dataframe
        modality = vars_df0.iloc[idx].modality
        bThr = vars_df0.iloc[idx].bonf
        df = df[df.log_p_values>=bThr].sort_values('log_p_values', ascending=False)
        vars_over_bThr = df.names.to_list()
        pearson_r_over_bThr = df.pearson_r.to_list()
        if var in vars_over_bThr:
            modalities_with_var.append(modality)
            pearsonRs_with_var.append(pearson_r_over_bThr[vars_over_bThr.index(var)])
    modalities0.append(modalities_with_var)
    pearson_rs_list0.append(pearsonRs_with_var)

In [25]:
df_vars0_over_bThr = pd.DataFrame.from_dict({
    'names': unique_vars0_over_bTHr,
    'Categories': unique_vars0_over_bThr_categories,
    'modalities': modalities0,
    'pearson_r': pearson_rs_list0
})

In [26]:
df_vars0_over_bThr[df_vars0_over_bThr.names=='Alcohol intake frequency. (2.0)']

Unnamed: 0,names,Categories,modalities,pearson_r
3,Alcohol intake frequency. (2.0),Alcohol,"[tbss_FA_s, tbss_L1_s, tbss_MO_s, tbss_OD_s, t...","[0.08105822842776622, 0.0794058855339503, 0.07..."


In [27]:
for cat in sorted(list(set(df_vars_over_bThr.Categories.to_list()))):
    print("Category: {}".format(cat))
    print('------------------------------------------------')
    names = df_vars_over_bThr[df_vars_over_bThr.Categories == cat].names.to_list()
    for name in sorted(names):
        print('Var: {}'.format(name))
        mods = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].modalities.to_list()
        prs = df_vars_over_bThr[(df_vars_over_bThr.Categories == cat) & (df_vars_over_bThr.names == name)].pearson_r.to_list()
        mods0 = df_vars0_over_bThr[(df_vars0_over_bThr.Categories == cat) & (df_vars0_over_bThr.names == name)].modalities.to_list()
        prs0 = df_vars0_over_bThr[(df_vars0_over_bThr.Categories == cat) & (df_vars0_over_bThr.names == name)].pearson_r.to_list()
        print('   Modalities:')
        
        prs_abs = np.abs(prs).tolist()
        _, mods = zip(*sorted(zip(prs_abs[0], mods[0]), reverse=True))
        _, prs = zip(*sorted(zip(prs_abs[0], prs[0]), reverse=True))
        
        for i, mod in enumerate(mods):
            print('   -> {} ({:.3f})'.format(mod, prs[i]))
        
        
        if len(mods0) == 0:
            print('======== ATTENTION! NEW VAR ! ==========')
        else:
            prs_abs0 = np.abs(prs0).tolist()
            _, mods0 = zip(*sorted(zip(prs_abs0[0], mods0[0]), reverse=True))
            _, prs0 = zip(*sorted(zip(prs_abs0[0], prs0[0]), reverse=True))
        

#         for i, mod in enumerate(mods0):
#             print('   -> {} ({:.3f})'.format(mod, prs0[i]))
            print('   -> {} ({:.3f})'.format(mods0[0], prs0[0]))
        
    print('\n')

Category: Blood Assays
------------------------------------------------
Var: IGF-1 (0.0)
   Modalities:
   -> IC = 4 (-0.084)
   -> tbss_FA (-0.094)
Var: Mean corpuscular haemoglobin (0.0)
   Modalities:
   -> IC = 3 (0.072)
   -> tbss_MD (0.097)


Category: Cardiac & Circulartory Measurements
------------------------------------------------
Var: Systolic blood pressure, automated reading (0.0)
   Modalities:
   -> IC = 4 (0.076)
   -> tbss_L3_s (0.096)
Var: Systolic blood pressure, automated reading (0.1)
   Modalities:
   -> IC = 4 (0.082)
   -> tbss_L3_s (0.110)


Category: Cognitive Tests
------------------------------------------------
Var: Number of symbol digit matches attempted (2.0)
   Modalities:
   -> IC = 4 (-0.089)
   -> tfmri_1 (-0.107)
Var: Number of symbol digit matches made correctly (2.0)
   Modalities:
   -> IC = 4 (-0.093)
   -> tfmri_c_1 (-0.110)


Category: Lifestyle
------------------------------------------------
Var: Age first had sexual intercourse (0.0)
   Mo