In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
import argparse
import h5py
from scipy.stats import t as student_t
from statsmodels.stats import multitest as mt
import matplotlib.pyplot as plt
import seaborn as sns
# Switch matplotlib/seaborn figures in this notebook to seaborn's default theme.
sns.set()

In [2]:
# Per-modality association results. Judging by the table displayed below, each
# row holds a modality label, its Bonferroni/FDR thresholds and a nested
# results DataFrame.
# NOTE(review): read_pickle can execute code embedded in the file, so only ever
# point this at a trusted, locally generated pickle.
vars_df = pd.read_pickle('male_subject_direction_ICA_deltas_nIDP_STEVEnorm_deconf.pkl')

# One variable name per line; only the line terminators are removed.
with open('varsHeader.txt') as f:
    varsHeader = np.array([l.strip('\n\r') for l in f])

# One category label per line. The previous np.loadtxt(..., delimiter='\n')
# call is rejected by newer NumPy releases (1.23+ refuse a newline delimiter),
# so the file is read directly instead. To stay close to loadtxt's defaults,
# anything after a '#' is treated as a comment and empty lines are skipped.
with open('vars_categories.txt') as f:
    category_lines = (raw.split('#', 1)[0].strip('\r\n') for raw in f)
    vars_categories = np.array([line for line in category_lines if line], dtype=str)

In [3]:
# Show the loaded per-modality table (one row per ICA component).
vars_df

Unnamed: 0,modality,bonf,fdr,dataframe
0,IC = 0,5.508233,3.454135,idx ...
1,IC = 1,5.508233,3.465645,idx ...


In [4]:
# Inspect the nested results frame stored in the first row's `dataframe` field.
vars_df.iloc[0].dataframe

Unnamed: 0,idx,names,Categories,pearson_r,t_test_statistic,p_values_corrected,p_values,abs_pearson_r,log_p_values,log_p_values_corrected
0,0,Ethnic background (0.0),Ethnic Background,0.022909,1.511375,0.838967,0.130766,0.022909,0.883507,0.076255
1,1,Ethnic background (1.0),Ethnic Background,0.055081,1.517769,0.838620,0.129490,0.055081,0.887763,0.076435
2,2,Ethnic background (2.0),Ethnic Background,-0.026789,0.973647,0.978541,0.330410,0.026789,0.480947,0.009421
3,3,Genotype measurement batch (0.0),Genetic Markers,0.014719,0.962168,0.980593,0.336020,0.014719,0.473635,0.008511
4,4,Heterozygosity (0.0),Genetic Markers,-0.008991,0.587682,1.000000,0.556777,0.008991,0.254319,-0.000000
...,...,...,...,...,...,...,...,...,...,...
16109,17516,Activities undertaken to treat anxiety (0.1),Mental Health,-0.008666,0.155024,1.000000,0.876900,0.008666,0.057050,-0.000000
16110,17518,Methods of self-harm used (0.1),Mental Health,-0.000704,0.006455,1.000000,0.994865,0.000704,0.002236,-0.000000
16111,17521,Actions taken following self-harm (0.1),Mental Health,0.180968,1.261480,0.925228,0.213361,0.180968,0.670885,0.033751
16112,17525,Ever been offered/sought treatment for anxiety...,Mental Health,-0.002792,0.160382,1.000000,0.872590,0.002792,0.059190,-0.000000


In [5]:
# Display names, matched to the rows of vars_df purely by position.
modality_names = [
    'Subject-Direction ICA Component 1',
    'Subject-Direction ICA Component 2'
]
current_mods = vars_df.modality.values

# Replace every raw modality label with the display name at the same position.
for position in range(len(current_mods)):
    raw_label = current_mods[position]
    has_label = vars_df["modality"] == raw_label
    vars_df.loc[has_label, "modality"] = modality_names[position]
    

In [6]:
# Column layout used when selecting from each nested results frame.
ordered_cols = [
     'Modality',
     'bonf',
     'passes_bonf',
     'fdr',
     'passes_fdr',
     'idx',
     'names',
     'Categories',
     'pearson_r',
     'abs_pearson_r',
     't_test_statistic',
     'p_values',
     'p_values_corrected',
     'log_p_values',
     'log_p_values_corrected',
]

# 'abs_pearson_r' is not part of the combined table, so it is left out up front
# instead of being selected and then dropped again.
output_cols = [col for col in ordered_cols if col != 'abs_pearson_r']


def _flatten_modality(row, columns):
    """Return one modality's results with its label, thresholds and pass flags.

    Parameters
    ----------
    row : pandas.Series
        A row of ``vars_df`` exposing ``modality``, ``bonf``, ``fdr`` and the
        nested results frame ``dataframe``.
    columns : list of str
        Columns to keep, in output order.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame nested inside ``vars_df`` is not modified, so
        re-running the cell always starts from the same inputs.
    """
    results = row.dataframe
    return results.assign(
        Modality=row.modality,
        bonf=row.bonf,
        fdr=row.fdr,
        # Both thresholds are compared against the uncorrected log_p_values.
        passes_bonf=results.log_p_values > row.bonf,
        passes_fdr=results.log_p_values > row.fdr,
    )[columns]


per_modality = [_flatten_modality(row, output_cols) for _, row in vars_df.iterrows()]

# Concatenate once rather than growing the frame inside the loop. Row labels
# are kept as they are in the nested frames, so they repeat across modalities.
# An empty vars_df yields an empty table with the expected columns instead of
# leaving `df` undefined.
df = pd.concat(per_modality) if per_modality else pd.DataFrame(columns=output_cols)

In [7]:
# categories = sorted(list(set(df.Categories.to_list())))

# first_df = True

# for cat in categories:
#     df_tmp = df[(df.Categories==cat) & (df.passes_bonf==True)].sort_values('log_p_values', ascending=False)
#     df_tmp = df_tmp.head(10)
#     if len(df_tmp) == 0:
#         continue
#     if first_df == True:
#         df_print = df_tmp
#         first_df = False
#     else:
#         df_print = pd.concat([df_print, df_tmp])
        
# df_print = df_print.drop(columns=[
#     'bonf', 'passes_bonf', 'fdr', 'passes_fdr', 'idx', 't_test_statistic', 'p_values',
#     'p_values_corrected', 'log_p_values_corrected'
# ])

# df_print_colums_order = [
#     'Modality', 'log_p_values', 'pearson_r', 'Categories', 'names'
# ]

# df_print = df_print[df_print_colums_order]

# df_print = df_print.round(decimals=3)

# df_print = df_print.rename(
#     columns={
#         'log_p_values' : '-log(p)',
#         'pearson_r' : 'Correlation (r)',
#         'Categories': 'Variable Category',
#         'names' : 'Variable Description'
#     }
# )

# df_print



In [8]:
# df_print.to_csv('Famale_top_nIDPs.csv', index=False)

In [9]:
# Preview the combined table: every modality's results stacked row-wise.
df

Unnamed: 0,Modality,bonf,passes_bonf,fdr,passes_fdr,idx,names,Categories,pearson_r,t_test_statistic,p_values,p_values_corrected,log_p_values,log_p_values_corrected
0,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,0,Ethnic background (0.0),Ethnic Background,0.022909,1.511375,0.130766,0.838967,0.883507,0.076255
1,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,1,Ethnic background (1.0),Ethnic Background,0.055081,1.517769,0.129490,0.838620,0.887763,0.076435
2,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,2,Ethnic background (2.0),Ethnic Background,-0.026789,0.973647,0.330410,0.978541,0.480947,0.009421
3,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,3,Genotype measurement batch (0.0),Genetic Markers,0.014719,0.962168,0.336020,0.980593,0.473635,0.008511
4,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,4,Heterozygosity (0.0),Genetic Markers,-0.008991,0.587682,0.556777,1.000000,0.254319,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16109,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17516,Activities undertaken to treat anxiety (0.1),Mental Health,0.080970,1.453210,0.147146,0.885413,0.832253,0.052854
16110,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17518,Methods of self-harm used (0.1),Mental Health,-0.002601,0.023839,0.981038,1.000000,0.008314,-0.000000
16111,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17521,Actions taken following self-harm (0.1),Mental Health,0.111819,0.771431,0.444313,1.000000,0.352311,-0.000000
16112,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,0.000253,0.014520,0.988416,1.000000,0.005060,-0.000000


In [10]:
# Presentation headers for the exported table, keyed by internal column name.
# NOTE(review): 'Caterogy ID' looks like a misspelling of 'Category ID'. It is
# kept as is because the female CSVs concatenated further down carry the same
# header; renaming it here alone would split the column in the merged tables.
column_labels = {
    'bonf': 'Bonferroni Threshold',
    'passes_bonf': 'Passes Bonf.',
    'fdr': 'False Discovery Rate (FDR)',
    'passes_fdr': 'Passes FDR',
    'idx': 'Caterogy ID',
    'names' : 'Variable Description',
    'Categories': 'Variable Category',
    'pearson_r' : 'Correlation (r)',
    't_test_statistic': 'T-Values',
    'p_values': 'P-Values',
    'p_values_corrected': 'FDR Adjusted P-Value',
    'log_p_values' : '-log(P)',
    'log_p_values_corrected': 'FDR Adjusted -log(P)',
}
df = df.rename(columns=column_labels)

df

Unnamed: 0,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,0,Ethnic background (0.0),Ethnic Background,0.022909,1.511375,0.130766,0.838967,0.883507,0.076255
1,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,1,Ethnic background (1.0),Ethnic Background,0.055081,1.517769,0.129490,0.838620,0.887763,0.076435
2,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,2,Ethnic background (2.0),Ethnic Background,-0.026789,0.973647,0.330410,0.978541,0.480947,0.009421
3,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,3,Genotype measurement batch (0.0),Genetic Markers,0.014719,0.962168,0.336020,0.980593,0.473635,0.008511
4,Subject-Direction ICA Component 1,5.508233,False,3.454135,False,4,Heterozygosity (0.0),Genetic Markers,-0.008991,0.587682,0.556777,1.000000,0.254319,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16109,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17516,Activities undertaken to treat anxiety (0.1),Mental Health,0.080970,1.453210,0.147146,0.885413,0.832253,0.052854
16110,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17518,Methods of self-harm used (0.1),Mental Health,-0.002601,0.023839,0.981038,1.000000,0.008314,-0.000000
16111,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17521,Actions taken following self-harm (0.1),Mental Health,0.111819,0.771431,0.444313,1.000000,0.352311,-0.000000
16112,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,0.000253,0.014520,0.988416,1.000000,0.005060,-0.000000


In [11]:
# Export the full renamed table for the male cohort; the row index is not written.
df.to_csv('Male_ALL_nIDPs_ICA.csv', index=False)

In [12]:
# Keep only the rows flagged as passing the FDR threshold. The explicit copy
# makes this an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
df_passing_FDR = df[df['Passes FDR']].copy()

In [13]:
# Preview the rows that pass the FDR threshold.
df_passing_FDR

Unnamed: 0,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
177,Subject-Direction ICA Component 1,5.508233,True,3.454135,True,182,Age first had sexual intercourse (0.0),Lifestyle,0.074673,4.793636,0.000002,0.000942,5.770593,3.025788
179,Subject-Direction ICA Component 1,5.508233,False,3.454135,True,184,Age first had sexual intercourse (2.0),Lifestyle,0.064507,4.032729,0.000056,0.012635,4.250346,1.898412
280,Subject-Direction ICA Component 1,5.508233,False,3.454135,True,391,How are people in household related to partici...,Lifestyle,-0.087322,3.618440,0.000305,0.042742,3.515655,1.369150
362,Subject-Direction ICA Component 1,5.508233,False,3.454135,True,524,Year ended full time education (0.0),Lifestyle,0.079452,3.853073,0.000120,0.021447,3.921593,1.668633
861,Subject-Direction ICA Component 1,5.508233,False,3.454135,True,1153,Cereal intake (0.0),Diet,0.068813,4.554030,0.000005,0.002355,5.267093,2.628091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9747,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,11082,Diagnoses - main ICD10 (K221 - K22.1 Ulcer of ...,Medical History,-0.054506,3.605651,0.000315,0.044118,3.501892,1.355386
10134,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,11470,Diagnoses - secondary ICD10 (E119 - E11.9 With...,Medical History,-0.064609,4.276528,0.000019,0.004339,4.712441,2.362570
12943,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,14315,Diagnoses - ICD10 (E119 - E11.9 Without compli...,Medical History,-0.058797,3.890482,0.000102,0.015732,3.993379,1.803209
13366,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,14738,Diagnoses - ICD10 (K221 - K22.1 Ulcer of oesop...,Medical History,-0.060983,4.035601,0.000055,0.010144,4.256524,1.993803


In [14]:
# Export the FDR-passing rows for the male cohort; the row index is not written.
df_passing_FDR.to_csv('Male_ALL_nIDPs_passing_FDR_ICA.csv', index=False)

In [15]:
def _with_sex_first(frame, sex):
    """Return a copy of ``frame`` with a constant 'Sex' column in first position.

    Any existing 'Sex' column is replaced, so the cell can be re-run safely.
    The input frame is not modified, which avoids the SettingWithCopyWarning
    raised when assigning into a filtered slice.
    """
    labelled = frame.drop(columns='Sex', errors='ignore')
    labelled.insert(0, 'Sex', sex)
    return labelled


# Female results are produced by the sibling analysis and read from its CSV.
df_female_passing_FDR = _with_sex_first(
    pd.read_csv('../Analsysis Female New/Famale_ALL_nIDPs_passing_FDR_ICA.csv'),
    'Female',
)
df_passing_FDR = _with_sex_first(df_passing_FDR, 'Male')

# Female rows first, then male rows; each cohort keeps its own row labels.
df_ALL_passing_FDR = pd.concat([df_female_passing_FDR, df_passing_FDR])
df_ALL_passing_FDR.to_csv('MALE_FEMALE_ALL_nIDPs_passing_FDR_ICA.csv', index=False)
df_ALL_passing_FDR

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_passing_FDR['Sex'] = 'Male'


Unnamed: 0,Sex,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,True,166,Father's age at death (2.0),Lifestyle,0.058138,3.737560,1.883351e-04,0.034611,3.725069,1.460788
1,Female,Subject-Direction ICA Component 1,5.509794,True,3.524662,True,1131,Pork intake (2.0),Diet,-0.073752,5.217796,1.884126e-07,0.000102,6.724890,3.992428
2,Female,Subject-Direction ICA Component 1,5.509794,True,3.524662,True,2778,Alcohol intake frequency. (0.0),Alcohol,-0.078131,5.551083,2.985040e-08,0.000025,7.525050,4.595040
3,Female,Subject-Direction ICA Component 1,5.509794,True,3.524662,True,2780,Alcohol intake frequency. (2.0),Alcohol,-0.076838,5.442281,5.511884e-08,0.000045,7.258700,4.350966
4,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,True,2782,Average weekly red wine intake (0.0),Alcohol,-0.064091,3.822212,1.345326e-04,0.030643,3.871172,1.513667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9747,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,11082,Diagnoses - main ICD10 (K221 - K22.1 Ulcer of ...,Medical History,-0.054506,3.605651,3.148533e-04,0.044118,3.501892,1.355386
10134,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,11470,Diagnoses - secondary ICD10 (E119 - E11.9 With...,Medical History,-0.064609,4.276528,1.938914e-05,0.004339,4.712441,2.362570
12943,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,14315,Diagnoses - ICD10 (E119 - E11.9 Without compli...,Medical History,-0.058797,3.890482,1.015362e-04,0.015732,3.993379,1.803209
13366,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,True,14738,Diagnoses - ICD10 (K221 - K22.1 Ulcer of oesop...,Medical History,-0.060983,4.035601,5.539575e-05,0.010144,4.256524,1.993803


In [16]:
# Full (unfiltered) female table written by the sibling analysis.
df_female = pd.read_csv('../Analsysis Female New/Famale_ALL_nIDPs_ICA.csv')

# Tag each cohort, then move the tag so it becomes the first column.
for cohort, frame in (('Male', df), ('Female', df_female)):
    frame['Sex'] = cohort
    frame.insert(0, 'Sex', frame.pop('Sex'))

# Female rows first, then male rows; write the merged table without the index.
df_ALL = pd.concat([df_female, df])
df_ALL.to_csv('MALE_FEMALE_ALL_nIDPs_ICA.csv', index=False)
df_ALL



Unnamed: 0,Sex,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,False,0,Ethnic background (0.0),Ethnic Background,0.027647,1.957215,0.050378,0.627484,1.297762,0.202398
1,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,False,1,Ethnic background (1.0),Ethnic Background,0.035055,1.014186,0.310787,0.919212,0.507537,0.036584
2,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,False,2,Ethnic background (2.0),Ethnic Background,0.039270,1.482522,0.138423,0.793821,0.858792,0.100277
3,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,False,3,Genotype measurement batch (0.0),Genetic Markers,0.049738,3.478130,0.000509,0.073135,3.292996,1.135872
4,Female,Subject-Direction ICA Component 1,5.509794,False,3.524662,False,4,Heterozygosity (0.0),Genetic Markers,0.009976,0.696801,0.485960,0.984174,0.313399,0.006928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16109,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17516,Activities undertaken to treat anxiety (0.1),Mental Health,0.080970,1.453210,0.147146,0.885413,0.832253,0.052854
16110,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17518,Methods of self-harm used (0.1),Mental Health,-0.002601,0.023839,0.981038,1.000000,0.008314,-0.000000
16111,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17521,Actions taken following self-harm (0.1),Mental Health,0.111819,0.771431,0.444313,1.000000,0.352311,-0.000000
16112,Male,Subject-Direction ICA Component 2,5.508233,False,3.465645,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,0.000253,0.014520,0.988416,1.000000,0.005060,-0.000000
