In [1]:
from scipy import stats
import pandas as pd
import numpy as np

# Registries of dataset names, one list per category. Every category list
# built with sorted() is in alphabetical order; All_Diseases follows the
# A-Z order of its letter suffix.
All_Biomarkers = sorted([
    'HandGripStrength',
    'BrainGreyMatterVolumes',
    'BrainSubcorticalVolumes',
    'HeartSize',
    'HeartPWA',
    'ECGAtRest',
    'AnthropometryImpedance',
    'UrineBiochemistry',
    'BloodBiochemistry',
    'BloodCount',
    'EyeAutorefraction',
    'EyeAcuity',
    'EyeIntraocularPressure',
    'BraindMRIWeightedMeans',
    'Spirometry',
    'BloodPressure',
    'AnthropometryBodySize',
    'ArterialStiffness',
    'CarotidUltrasound',
    'BoneDensitometryOfHeel',
    'HearingTest',
    'CognitiveFluidIntelligence',
    'CognitiveMatrixPatternCompletion',
    'CognitiveNumericMemory',
    'CognitivePairedAssociativeLearning',
    'CognitivePairsMatching',
    'CognitiveProspectiveMemory',
    'CognitiveReactionTime',
    'CognitiveSymbolDigitSubstitution',
    'CognitiveTowerRearranging',
    'CognitiveTrailMaking',
    'PhysicalActivity',
])

All_Environmental = sorted([
    "Alcohol",
    "Diet",
    "EarlyLifeFactors",
    "ElectronicDevices",
    "Medication",
    "SunExposure",
    "Smoking",
])

All_Socioeconomics = sorted([
    "Education",
    "Employment",
    "Household",
    "SocialSupport",
    "OtherSociodemographics",
])

All_Phenotypes = sorted([
    "Breathing",
    "CancerScreening",
    "ChestPain",
    "Claudication",
    "Eyesight",
    "GeneralHealth",
    "GeneralPain",
    "Hearing",
    "MentalHealth",
    "Mouth",
    "SexualFactors",
    "Sleep",
])

# One entry per capital letter: medical_diagnoses_A ... medical_diagnoses_Z.
All_Diseases = ['medical_diagnoses_' + letter for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']

# Every dataset name from every category, plus the stand-alone "FamilyHistory".
All = sorted([
    *All_Biomarkers,
    *All_Environmental,
    *All_Socioeconomics,
    *All_Phenotypes,
    *All_Diseases,
    "FamilyHistory",
])


# Organ / sub-organ labels used as the second component of the result file
# names (linear_correlations_<dataset>_<organ>.csv). The files that carry no
# organ label are read by a separate pass after the per-organ loop.
organs = [
    '_instances01',
    '_instances1.5x',
    '_instances23',
    'Abdomen',
    'AbdomenLiver',
    'AbdomenPancreas',
    'Arterial',
    'ArterialPulseWaveAnalysis',
    'ArterialCarotids',
    'Biochemistry',
    'BiochemistryUrine',
    'BiochemistryBlood',
    'Brain',
    'BrainCognitive',
    'BrainMRI',
    'Eyes',
    'EyesAll',
    'EyesFundus',
    'EyesOCT',
    'Hearing',
    'Heart',
    'HeartECG',
    'HeartMRI',
    'ImmuneSystem',
    'Lungs',
    'Musculoskeletal',
    'MusculoskeletalSpine',
    'MusculoskeletalHips',
    'MusculoskeletalKnees',
    'MusculoskeletalFullBody',
    'MusculoskeletalScalars',
    'PhysicalActivity',
]

# The result files for "Claudication" are looked up under the spelling
# "Claudification". Only the file name uses that spelling; the `env_dataset`
# column always carries the canonical name from `All`.
_FILE_NAME_OVERRIDES = {"Claudication": "Claudification"}

# Rows whose `size_na_dropped` is below this value are discarded.
# NOTE(review): presumably the number of samples left after dropping NaNs - confirm.
MIN_SIZE_NA_DROPPED = 10


def _read_linear_correlations(path, env_dataset):
    """Read one linear-correlation CSV and tag its rows with `env_dataset`.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    env_dataset : str
        Canonical dataset name written to the `env_dataset` column.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file does not exist. Any other
        read error is propagated.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        return None
    df['env_dataset'] = env_dataset
    return df


list_df = []

# Pass 1: one file per (dataset, organ) pair.
for env_dataset in All:
    file_dataset = _FILE_NAME_OVERRIDES.get(env_dataset, env_dataset)
    for organ in organs:
        df = _read_linear_correlations(
            '../linear_output_paper/linear_correlations_%s_%s.csv' % (file_dataset, organ),
            env_dataset,
        )
        if df is None:
            # Report the missing (file dataset name, organ) combination and move on.
            print(file_dataset)
            print(organ)
            continue
        list_df.append(df)

# Pass 2: the per-dataset file that carries no organ label.
for env_dataset in All:
    file_dataset = _FILE_NAME_OVERRIDES.get(env_dataset, env_dataset)
    df = _read_linear_correlations(
        '../linear_output_paper/linear_correlations_%s_/_.csv' % (file_dataset),
        env_dataset,
    )
    if df is None:
        print(file_dataset)
        print("exception_organ")
        continue
    list_df.append(df)

if not list_df:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No linear correlation file could be read from '../linear_output_paper/'"
    )

final_df = pd.concat(list_df, ignore_index=True)
# Keep every row that is not explicitly below the threshold (rows with a
# missing size are kept); the original index labels are preserved.
final_df = final_df[~(final_df.size_na_dropped < MIN_SIZE_NA_DROPPED)].copy()

In [2]:
def _significant_features(df_organ):
    """Return the env feature names whose p-value is below 0.05 / number of rows (Bonferroni)."""
    n_features = df_organ.shape[0]
    return pd.Index(df_organ.loc[df_organ['p_val'] < 0.05 / n_features, 'env_feature_name'])


def _select_corr_values(df_1, sig_1, df_2, sig_2, method):
    """Pick the `corr_value` entries of both organs according to the selection `method`."""
    if method == 'Union':
        kept_features = sig_1.union(sig_2)
    elif method == 'Intersection':
        kept_features = sig_1.intersection(sig_2)
    else:
        # Any other method name keeps every feature.
        return df_1.corr_value, df_2.corr_value
    values_1 = df_1.loc[df_1.env_feature_name.isin(kept_features), 'corr_value']
    values_2 = df_2.loc[df_2.env_feature_name.isin(kept_features), 'corr_value']
    return values_1, values_2


def _pair_correlation(values_1, values_2, corr_func):
    """Correlate two value series and return `(corr, sample_size)`.

    With at most one value on the first side the correlation is NaN and the
    sample size is reported as 1. When the correlation cannot be computed
    (different lengths, or scipy raising ValueError) the correlation is
    reported as 0, and the sample size is the shorter of the two lengths.
    """
    if len(values_1) <= 1:
        return np.nan, 1
    sample_size = min(len(values_1), len(values_2))
    if len(values_1) != len(values_2):
        # scipy would raise ValueError here; short-circuit to the same result.
        return 0, sample_size
    try:
        corr, _ = corr_func(values_1, values_2)
    except ValueError:
        corr = 0
    return corr, sample_size


def Create_data(corr_type, method):
    """Correlate, for every pair of organs, their per-feature linear correlations and save them.

    For each dataset found in `final_df.env_dataset`, plus the aggregated
    groups "All", "All_Biomarkers", "All_Environmental", "All_Socioeconomics",
    "All_Phenotypes" and "All_Diseases", every ordered pair of values of
    `target_dataset_name` is compared: the `corr_value` entries of the two
    organs are correlated with each other. The table is written to
    '../data/page6_LinearXWASCorrelations/CorrelationsLinear/Correlations_<method>_<corr_type>.csv'
    with columns env_dataset, organ_1, organ_2, corr, sample_size.

    Parameters
    ----------
    corr_type : str
        'Spearman' or 'Pearson'.
    method : str
        'Union' keeps the features significant for at least one of the two
        organs, 'Intersection' those significant for both; any other value
        keeps all features.

    Raises
    ------
    ValueError
        If `corr_type` is not 'Spearman' or 'Pearson'.
    """
    corr_funcs = {'Spearman': stats.spearmanr, 'Pearson': stats.pearsonr}
    if corr_type not in corr_funcs:
        # Previously an unknown type silently reused the previous pair's value.
        raise ValueError("corr_type must be 'Spearman' or 'Pearson', got %r" % (corr_type,))
    corr_func = corr_funcs[corr_type]

    grouped_datasets = {
        "All": All,
        "All_Biomarkers": All_Biomarkers,
        "All_Environmental": All_Environmental,
        "All_Socioeconomics": All_Socioeconomics,
        "All_Phenotypes": All_Phenotypes,
        "All_Diseases": All_Diseases,
    }
    env_dataset_names = final_df.env_dataset.drop_duplicates().tolist() + list(grouped_datasets)

    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    records = []
    for env_dataset_name in env_dataset_names:
        env_dataset = grouped_datasets.get(env_dataset_name, [env_dataset_name])
        df_env = final_df[final_df.env_dataset.isin(env_dataset)]

        # Per-organ slice and significant features, computed once per organ
        # instead of once per organ pair.
        # NOTE(review): fillna(0) also turns missing p-values into 0, which
        # makes those features count as significant - confirm this is intended.
        organ_names = df_env.target_dataset_name.drop_duplicates().tolist()
        organ_frames = {}
        organ_significant = {}
        for organ in organ_names:
            df_organ = df_env[df_env.target_dataset_name == organ].fillna(0)
            organ_frames[organ] = df_organ
            organ_significant[organ] = _significant_features(df_organ)

        for organ_1 in organ_names:
            for organ_2 in organ_names:
                # NOTE(review): values are paired by row position, not by
                # env_feature_name - verify both organs list features in the same order.
                values_1, values_2 = _select_corr_values(
                    organ_frames[organ_1], organ_significant[organ_1],
                    organ_frames[organ_2], organ_significant[organ_2],
                    method,
                )
                corr, sample_size = _pair_correlation(values_1, values_2, corr_func)
                records.append({'env_dataset': env_dataset_name,
                                'organ_1': organ_1,
                                'organ_2': organ_2,
                                'corr': corr,
                                'sample_size': sample_size})

    df_corr_env = pd.DataFrame(records, columns=['env_dataset', 'organ_1', 'organ_2', 'corr', 'sample_size'])
    df_corr_env.to_csv('../data/page6_LinearXWASCorrelations/CorrelationsLinear/Correlations_%s_%s.csv'% (method, corr_type))


# Produce one output file per (feature-selection method, correlation type)
# combination, announcing each combination before it is computed.
run_grid = [
    (method, corr_type)
    for method in ('All', 'Union', 'Intersection')
    for corr_type in ('Pearson', 'Spearman')
]
for method, corr_type in run_grid:
    print(method, corr_type)
    Create_data(corr_type, method)

array is constant; the correlation coefficent is not defined.


PermissionError: [Errno 13] Permission denied: '../data/page6_LinearXWASCorrelations/CorrelationsLinear/Correlations_All_Pearson.csv'