In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import sys
import argparse
import h5py
from scipy.stats import t as student_t
from statsmodels.stats import multitest as mt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
# Load the pickled per-modality results table.
# SECURITY NOTE: unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
vars_df = pd.read_pickle('male_vars_ensembles_testset.pkl')

# One header entry per line; only the line terminators are stripped.
with open('varsHeader.txt') as f:
    varsHeader = np.array([line.strip('\n\r') for line in f])

# One category label per line. Read the file directly instead of
# np.loadtxt(..., delimiter='\n'): NumPy >= 1.23 rejects a newline delimiter
# with a TypeError. Blank lines are skipped, as loadtxt did.
# NOTE(review): unlike loadtxt, text after a '#' is no longer treated as a
# comment, so labels containing '#' are kept intact — confirm this is desired.
with open("vars_categories.txt") as f:
    vars_categories = np.array(
        [label for label in (line.strip('\n\r') for line in f) if label],
        dtype=str,
    )

In [3]:
# Display the table loaded from the pickle.
vars_df

Unnamed: 0,modality,bonf,fdr,dataframe
0,NaiveFull,5.508287,3.93305,idx ...
1,LinearRegressionFull,5.508287,4.485197,idx ...
2,ElasticNetFull,5.508287,4.765178,idx ...
3,Cluster1,5.508287,4.108133,idx ...
4,Cluster2,5.508287,4.502409,idx ...
5,Cluster3,5.508287,4.675435,idx ...
6,Cluster4,5.508287,4.754493,idx ...
7,Cluster5,5.508287,4.644724,idx ...
8,Cluster6,5.508287,,idx ...
9,Cluster7,5.508287,3.800096,idx ...


In [4]:
# Inspect the nested table stored in the `dataframe` column of the first row.
vars_df.iloc[0].dataframe

Unnamed: 0,idx,names,Categories,pearson_r,t_test_statistic,p_values_corrected,p_values,abs_pearson_r,log_p_values,log_p_values_corrected
0,0,Ethnic background (0.0),Ethnic Background,0.016286,0.759648,1.000000,0.447547,0.016286,0.349161,-0.000000
1,1,Ethnic background (1.0),Ethnic Background,-0.005304,0.104620,1.000000,0.916732,0.005304,0.037758,-0.000000
2,2,Ethnic background (2.0),Ethnic Background,0.059198,1.497888,0.945546,0.134657,0.059198,0.870772,0.024317
3,3,Genotype measurement batch (0.0),Genetic Markers,-0.024794,1.145993,1.000000,0.251927,0.024794,0.598726,-0.000000
4,4,Heterozygosity (0.0),Genetic Markers,-0.011541,0.533297,1.000000,0.593884,0.011541,0.226299,-0.000000
...,...,...,...,...,...,...,...,...,...,...
16111,17518,Methods of self-harm used (0.1),Mental Health,0.011187,0.075049,1.000000,0.940509,0.011187,0.026637,-0.000000
16112,17521,Actions taken following self-harm (0.1),Mental Health,-0.305485,1.635876,0.917416,0.113916,0.305485,0.943417,0.037434
16113,17522,Actions taken following self-harm (0.2),Mental Health,-0.386420,1.919931,0.832234,0.068554,0.386420,1.163970,0.079754
16114,17525,Ever been offered/sought treatment for anxiety...,Mental Health,-0.030560,1.243830,1.000000,0.213738,0.030560,0.670117,-0.000000


In [5]:
# Identifiers as stored in the pickle, captured before relabelling.
current_mods = vars_df.modality.values

# Display labels, matched to the rows of vars_df purely by position.
# NOTE(review): this assumes the pickle's row order matches this list — confirm.
modality_names = [
    'Naive Average', 'Linear Regression', 'ElasticNet', 'Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6',
    'Cluster 7', 'Cluster 8', 'Cluster 9', 'Cluster 10', 'Cluster 11', 'RENT', 'MLP'
]

# Fail with a clear message instead of a bare IndexError when the table has
# more rows than there are labels.
if len(vars_df) > len(modality_names):
    raise ValueError(
        f"vars_df has {len(vars_df)} rows but only {len(modality_names)} modality labels are defined"
    )

# Assign by position directly. Matching on the old value row by row is
# unnecessary and could mis-rename rows if a new label ever coincided with a
# not-yet-processed identifier.
vars_df["modality"] = modality_names[:len(vars_df)]

In [6]:
# Column order used when assembling the combined table. 'abs_pearson_r' is
# listed here but is excluded from the combined table itself.
ordered_cols = [
     'Modality',
     'bonf',
     'passes_bonf',
     'fdr',
     'passes_fdr',
     'idx',
     'names',
     'Categories',
     'pearson_r',
     'abs_pearson_r',
     't_test_statistic',
     'p_values',
     'p_values_corrected',
     'log_p_values',
     'log_p_values_corrected', 
]
output_cols = [col for col in ordered_cols if col != 'abs_pearson_r']


def _tag_modality_table(row):
    """Return one modality's nested table tagged with its modality and thresholds.

    Parameters
    ----------
    row : namedtuple
        One row of ``vars_df`` (from ``itertuples``) exposing ``modality``,
        ``bonf``, ``fdr`` and ``dataframe``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns in ``output_cols``. The frame stored in
        ``vars_df`` is left untouched.
    """
    # Copy so that the frame held inside vars_df is not mutated.
    table = row.dataframe.copy()
    table['Modality'] = row.modality
    table['bonf'] = row.bonf
    table['fdr'] = row.fdr
    # A row passes when its log_p_values exceeds the threshold; a NaN threshold
    # compares False, so such rows never pass.
    table['passes_bonf'] = table['log_p_values'] > row.bonf
    table['passes_fdr'] = table['log_p_values'] > row.fdr
    return table[output_cols]


# Build every per-modality table first and concatenate once; concatenating
# inside the loop re-copies the accumulated rows on every iteration.
# Each table keeps its own row index, so index labels repeat across modalities.
# To restrict the table to a subset of modalities, select rows of vars_df here,
# e.g. vars_df.iloc[[1, 2, 14, 15]].
df = pd.concat([_tag_modality_table(row) for row in vars_df.itertuples(index=False)])

In [7]:
# categories = sorted(list(set(df.Categories.to_list())))

# first_df = True

# for cat in categories:
#     df_tmp = df[(df.Categories==cat) & (df.passes_bonf==True)].sort_values('log_p_values', ascending=False)
#     df_tmp = df_tmp.head(10)
#     if len(df_tmp) == 0:
#         continue
#     if first_df == True:
#         df_print = df_tmp
#         first_df = False
#     else:
#         df_print = pd.concat([df_print, df_tmp])
        
# df_print = df_print.drop(columns=[
#     'bonf', 'passes_bonf', 'fdr', 'passes_fdr', 'idx', 't_test_statistic', 'p_values',
#     'p_values_corrected', 'log_p_values_corrected'
# ])

# df_print_colums_order = [
#     'Modality', 'log_p_values', 'pearson_r', 'Categories', 'names'
# ]

# df_print = df_print[df_print_colums_order]

# df_print = df_print.round(decimals=3)

# df_print = df_print.rename(
#     columns={
#         'log_p_values' : '-log(p)',
#         'pearson_r' : 'Correlation (r)',
#         'Categories': 'Variable Category',
#         'names' : 'Variable Description'
#     }
# )

# df_print



In [8]:
# df_print.to_csv('Famale_top_nIDPs.csv', index=False)

In [9]:
# Display the combined table (all modalities stacked).
df

Unnamed: 0,Modality,bonf,passes_bonf,fdr,passes_fdr,idx,names,Categories,pearson_r,t_test_statistic,p_values,p_values_corrected,log_p_values,log_p_values_corrected
0,Naive Average,5.508287,False,3.933050,False,0,Ethnic background (0.0),Ethnic Background,0.016286,0.759648,0.447547,1.000000,0.349161,-0.000000
1,Naive Average,5.508287,False,3.933050,False,1,Ethnic background (1.0),Ethnic Background,-0.005304,0.104620,0.916732,1.000000,0.037758,-0.000000
2,Naive Average,5.508287,False,3.933050,False,2,Ethnic background (2.0),Ethnic Background,0.059198,1.497888,0.134657,0.945546,0.870772,0.024317
3,Naive Average,5.508287,False,3.933050,False,3,Genotype measurement batch (0.0),Genetic Markers,-0.024794,1.145993,0.251927,1.000000,0.598726,-0.000000
4,Naive Average,5.508287,False,3.933050,False,4,Heterozygosity (0.0),Genetic Markers,-0.011541,0.533297,0.593884,1.000000,0.226299,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16111,MLP,5.508287,False,4.280427,False,17518,Methods of self-harm used (0.1),Mental Health,-0.086027,0.579235,0.565319,1.000000,0.247706,-0.000000
16112,MLP,5.508287,False,4.280427,False,17521,Actions taken following self-harm (0.1),Mental Health,-0.351896,1.916932,0.066298,0.831489,1.178498,0.080143
16113,MLP,5.508287,False,4.280427,False,17522,Actions taken following self-harm (0.2),Mental Health,-0.406421,2.038398,0.054300,0.795836,1.265202,0.099176
16114,MLP,5.508287,False,4.280427,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,-0.031463,1.280621,0.200506,1.000000,0.697872,-0.000000


In [10]:
# Publication-style headers for the combined table, keyed by internal column name.
# The 'Caterogy ID' spelling is kept on purpose: the combined male/female table
# shown further down has a single column with this header, so the female CSV
# appears to use the same spelling — changing it here would split the column.
column_labels = {
    'bonf': 'Bonferroni Threshold',
    'passes_bonf': 'Passes Bonf.',
    'fdr': 'False Discovery Rate (FDR)',
    'passes_fdr': 'Passes FDR',
    'idx': 'Caterogy ID',
    'names': 'Variable Description',
    'Categories': 'Variable Category',
    'pearson_r': 'Correlation (r)',
    't_test_statistic': 'T-Values',
    'p_values': 'P-Values',
    'p_values_corrected': 'FDR Adjusted P-Value',
    'log_p_values': '-log(P)',
    'log_p_values_corrected': 'FDR Adjusted -log(P)',
}
df = df.rename(columns=column_labels)

df

Unnamed: 0,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Naive Average,5.508287,False,3.933050,False,0,Ethnic background (0.0),Ethnic Background,0.016286,0.759648,0.447547,1.000000,0.349161,-0.000000
1,Naive Average,5.508287,False,3.933050,False,1,Ethnic background (1.0),Ethnic Background,-0.005304,0.104620,0.916732,1.000000,0.037758,-0.000000
2,Naive Average,5.508287,False,3.933050,False,2,Ethnic background (2.0),Ethnic Background,0.059198,1.497888,0.134657,0.945546,0.870772,0.024317
3,Naive Average,5.508287,False,3.933050,False,3,Genotype measurement batch (0.0),Genetic Markers,-0.024794,1.145993,0.251927,1.000000,0.598726,-0.000000
4,Naive Average,5.508287,False,3.933050,False,4,Heterozygosity (0.0),Genetic Markers,-0.011541,0.533297,0.593884,1.000000,0.226299,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16111,MLP,5.508287,False,4.280427,False,17518,Methods of self-harm used (0.1),Mental Health,-0.086027,0.579235,0.565319,1.000000,0.247706,-0.000000
16112,MLP,5.508287,False,4.280427,False,17521,Actions taken following self-harm (0.1),Mental Health,-0.351896,1.916932,0.066298,0.831489,1.178498,0.080143
16113,MLP,5.508287,False,4.280427,False,17522,Actions taken following self-harm (0.2),Mental Health,-0.406421,2.038398,0.054300,0.795836,1.265202,0.099176
16114,MLP,5.508287,False,4.280427,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,-0.031463,1.280621,0.200506,1.000000,0.697872,-0.000000


In [12]:
# Export the full combined table; the row index is not written.
male_all_csv = 'Male_ALL_nIDPs_ENSEMBLES.csv'
df.to_csv(male_all_csv, index=False)

In [13]:
# Keep only the rows flagged as passing FDR. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
df_passing_FDR = df[df['Passes FDR'] == True].copy()

In [14]:
# Display the rows that pass FDR.
df_passing_FDR

Unnamed: 0,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
119,Naive Average,5.508287,False,3.933050,True,124,Type of accommodation lived in (0.0),Lifestyle,0.084315,3.950781,0.000080,0.043031,4.094888,1.366215
121,Naive Average,5.508287,False,3.933050,True,126,Type of accommodation lived in (2.0),Lifestyle,0.088853,4.147814,0.000035,0.028094,4.457621,1.551394
857,Naive Average,5.508287,False,3.933050,True,1153,Cereal intake (0.0),Diet,-0.090998,4.264490,0.000021,0.022446,4.680024,1.648858
859,Naive Average,5.508287,False,3.933050,True,1155,Cereal intake (2.0),Diet,-0.099109,4.626804,0.000004,0.009056,5.405202,2.043042
2328,Naive Average,5.508287,False,3.933050,True,2778,Alcohol intake frequency. (0.0),Alcohol,0.085555,4.007449,0.000063,0.037867,4.197630,1.421736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10135,MLP,5.508287,False,4.280427,True,11470,Diagnoses - secondary ICD10 (E119 - E11.9 With...,Medical History,0.086962,4.076691,0.000047,0.038972,4.324967,1.409251
12473,MLP,5.508287,False,4.280427,True,13810,Operative procedures - secondary OPCS4 (Z573 -...,Medical History,-0.087190,4.087449,0.000045,0.038972,4.344923,1.409251
12651,MLP,5.508287,False,4.280427,True,14008,Main speciality of consultant (recoded) (0.2),Medical History,-0.149260,4.128440,0.000041,0.038972,4.391100,1.409251
14250,MLP,5.508287,True,4.280427,True,15621,Diagnoses - ICD10 (S761 - S76.1 Injury of quad...,Medical History,-0.103840,4.875812,0.000001,0.003698,5.934873,2.432091


In [15]:
# Export the FDR-passing rows; the row index is not written.
male_passing_fdr_csv = 'Male_ALL_nIDPs_passing_FDR_ENSEMBLES.csv'
df_passing_FDR.to_csv(male_passing_fdr_csv, index=False)

In [16]:
# Combine the male FDR-passing rows with the female ones read from CSV,
# tagging every row with its sex in a leading 'Sex' column.
# NOTE: 'Analsysis' / 'Famale' are the spellings used on disk; the path must
# stay exactly as written.
df_female_passing_FDR = pd.read_csv('../Analsysis Female New/Famale_ALL_nIDPs_passing_FDR_ENSEMBLES.csv')

# Work on an independent copy: df_passing_FDR comes from a boolean-mask
# selection, and assigning a column on it directly triggers pandas'
# SettingWithCopyWarning. Dropping any existing 'Sex' column first keeps this
# cell safe to re-run.
df_passing_FDR = df_passing_FDR.drop(columns='Sex', errors='ignore').copy()
df_passing_FDR.insert(0, 'Sex', 'Male')

df_female_passing_FDR = df_female_passing_FDR.drop(columns='Sex', errors='ignore')
df_female_passing_FDR.insert(0, 'Sex', 'Female')

# Female rows first, then male rows; each frame keeps its own index labels.
df_ALL_passing_FDR = pd.concat([df_female_passing_FDR, df_passing_FDR])
df_ALL_passing_FDR.to_csv('MALE_FEMALE_ALL_nIDPs_passing_FDR_ENSEMBLES.csv', index=False)
df_ALL_passing_FDR

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_passing_FDR['Sex'] = 'Male'


Unnamed: 0,Sex,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Female,Naive Average,5.509606,False,3.300325,True,283,Age at menopause (last menstrual period) (2.0),Lifestyle,-0.084525,3.665329,0.000254,0.026482,3.595299,1.577055
1,Female,Naive Average,5.509606,False,3.300325,True,1172,Coffee intake (3.0),Diet,0.359461,3.694788,0.000373,0.036585,3.427788,1.436697
2,Female,Naive Average,5.509606,False,3.300325,True,1908,Crispbread intake (3.0),Diet,0.137881,4.041895,0.000058,0.007141,4.237532,2.146228
3,Female,Naive Average,5.509606,False,3.300325,True,1943,Number of crackers/crispbreads with butter/mar...,Diet,0.145734,3.571992,0.000383,0.037101,3.416475,1.430616
4,Female,Naive Average,5.509606,False,3.300325,True,2778,Alcohol intake frequency. (0.0),Alcohol,0.086238,4.334959,0.000015,0.002058,4.819479,2.686450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10135,Male,MLP,5.508287,False,4.280427,True,11470,Diagnoses - secondary ICD10 (E119 - E11.9 With...,Medical History,0.086962,4.076691,0.000047,0.038972,4.324967,1.409251
12473,Male,MLP,5.508287,False,4.280427,True,13810,Operative procedures - secondary OPCS4 (Z573 -...,Medical History,-0.087190,4.087449,0.000045,0.038972,4.344923,1.409251
12651,Male,MLP,5.508287,False,4.280427,True,14008,Main speciality of consultant (recoded) (0.2),Medical History,-0.149260,4.128440,0.000041,0.038972,4.391100,1.409251
14250,Male,MLP,5.508287,True,4.280427,True,15621,Diagnoses - ICD10 (S761 - S76.1 Injury of quad...,Medical History,-0.103840,4.875812,0.000001,0.003698,5.934873,2.432091


In [17]:
# Combine the full male table with the full female table read from CSV,
# tagging every row with its sex in a leading 'Sex' column.
df_female = pd.read_csv('../Analsysis Female New/Famale_ALL_nIDPs_ENSEMBLES.csv')

# Both frames are modified in place: set the label, then move it to column 0.
for frame, sex_label in ((df, 'Male'), (df_female, 'Female')):
    frame['Sex'] = sex_label
    frame.insert(0, 'Sex', frame.pop('Sex'))

# Female rows first, then male rows.
df_ALL = pd.concat([df_female, df])
df_ALL.to_csv('MALE_FEMALE_ALL_nIDPs_ENSEMBLES.csv', index=False)
df_ALL


Unnamed: 0,Sex,Modality,Bonferroni Threshold,Passes Bonf.,False Discovery Rate (FDR),Passes FDR,Caterogy ID,Variable Description,Variable Category,Correlation (r),T-Values,P-Values,FDR Adjusted P-Value,-log(P),FDR Adjusted -log(P)
0,Female,Naive Average,5.509606,False,3.300325,False,0,Ethnic background (0.0),Ethnic Background,-0.014876,0.744494,0.456647,1.000000,0.340419,4.821637e-17
1,Female,Naive Average,5.509606,False,3.300325,False,1,Ethnic background (1.0),Ethnic Background,-0.028788,0.595809,0.551618,1.000000,0.258362,4.821637e-17
2,Female,Naive Average,5.509606,False,3.300325,False,2,Ethnic background (2.0),Ethnic Background,-0.043779,1.176664,0.239718,0.972427,0.620300,1.214292e-02
3,Female,Naive Average,5.509606,False,3.300325,False,3,Genotype measurement batch (0.0),Genetic Markers,-0.059825,2.966517,0.003041,0.163519,2.516975,7.864327e-01
4,Female,Naive Average,5.509606,False,3.300325,False,4,Heterozygosity (0.0),Genetic Markers,-0.009512,0.470858,0.637784,1.000000,0.195326,4.821637e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16111,Male,MLP,5.508287,False,4.280427,False,17518,Methods of self-harm used (0.1),Mental Health,-0.086027,0.579235,0.565319,1.000000,0.247706,-0.000000e+00
16112,Male,MLP,5.508287,False,4.280427,False,17521,Actions taken following self-harm (0.1),Mental Health,-0.351896,1.916932,0.066298,0.831489,1.178498,8.014342e-02
16113,Male,MLP,5.508287,False,4.280427,False,17522,Actions taken following self-harm (0.2),Mental Health,-0.406421,2.038398,0.054300,0.795836,1.265202,9.917649e-02
16114,Male,MLP,5.508287,False,4.280427,False,17525,Ever been offered/sought treatment for anxiety...,Mental Health,-0.031463,1.280621,0.200506,1.000000,0.697872,-0.000000e+00
