In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sstats 
from statsmodels.stats import contingency_tables
from os.path import exists, join

In [2]:
PROJ_DIR = '/Volumes/Projects_Herting/LABDOCS/Personnel/Katie/deltaABCD_clustering'
DATA_DIR = 'data'
FIG_DIR = 'figures'

In [3]:
df = pd.read_csv(join(PROJ_DIR, DATA_DIR, 'data.csv'), 
                 header=0, 
                 index_col='subjectkey')

In [4]:
demographics = ["demo_prnt_ethn_v2",
                "demo_prnt_marital_v2",
                "demo_prnt_ed_v2",
                "demo_comb_income_v2",
                "demo_race_a_p___10",
                "demo_race_a_p___11",
                "demo_race_a_p___12",
                "demo_race_a_p___13",
                "demo_race_a_p___14",
                "demo_race_a_p___15",
                "demo_race_a_p___16",
                "demo_race_a_p___17",
                "demo_race_a_p___18",
                "demo_race_a_p___19",
                "demo_race_a_p___20",
                "demo_race_a_p___21",
                "demo_race_a_p___22",
                "demo_race_a_p___23",
                "demo_race_a_p___24",
                "demo_race_a_p___25",
                "site_id_l",
                "sex", 
                "mri_info_manufacturer"
               ]
mri_qc = [
    "imgincl_dmri_include",
    "imgincl_rsfmri_include",
    "imgincl_t1w_include",
    "imgincl_t2w_include",
    "interview_age",
    "interview_date"
]

In [5]:
demo_and_qc = []
for var in demographics + mri_qc:
    demo_and_qc.append(f'{var}.baseline_year_1_arm_1')
    if var in mri_qc:
        demo_and_qc.append(f'{var}.2_year_follow_up_y_arm_1')
    else:
        pass

In [6]:
demo_and_qc

['demo_prnt_ethn_v2.baseline_year_1_arm_1',
 'demo_prnt_marital_v2.baseline_year_1_arm_1',
 'demo_prnt_ed_v2.baseline_year_1_arm_1',
 'demo_comb_income_v2.baseline_year_1_arm_1',
 'demo_race_a_p___10.baseline_year_1_arm_1',
 'demo_race_a_p___11.baseline_year_1_arm_1',
 'demo_race_a_p___12.baseline_year_1_arm_1',
 'demo_race_a_p___13.baseline_year_1_arm_1',
 'demo_race_a_p___14.baseline_year_1_arm_1',
 'demo_race_a_p___15.baseline_year_1_arm_1',
 'demo_race_a_p___16.baseline_year_1_arm_1',
 'demo_race_a_p___17.baseline_year_1_arm_1',
 'demo_race_a_p___18.baseline_year_1_arm_1',
 'demo_race_a_p___19.baseline_year_1_arm_1',
 'demo_race_a_p___20.baseline_year_1_arm_1',
 'demo_race_a_p___21.baseline_year_1_arm_1',
 'demo_race_a_p___22.baseline_year_1_arm_1',
 'demo_race_a_p___23.baseline_year_1_arm_1',
 'demo_race_a_p___24.baseline_year_1_arm_1',
 'demo_race_a_p___25.baseline_year_1_arm_1',
 'site_id_l.baseline_year_1_arm_1',
 'sex.baseline_year_1_arm_1',
 'mri_info_manufacturer.baseline_ye

In [7]:
demo_df = df[demo_and_qc]

#site_baseline = pd.get_dummies(demo_df, 'site_id_l.baseline_year_1_arm_1')
#site_2yfu = pd.get_dummies(demo_df, 'site_id_l.2_year_follow_up_y_arm_1')

#demo_df = pd.concat([demo_df, site_baseline, site_2yfu], axis=1)

In [8]:
df = None

In [9]:
no_2yfu = demo_df[demo_df["interview_date.2_year_follow_up_y_arm_1"].isna() == True].index
lost_N = len(no_2yfu)
total_N = len(demo_df.index)

y2fu_df = demo_df.drop(no_2yfu, axis=0)

In [10]:
print(f"Of the total {total_N} participants at baseline, {lost_N} (or {np.round((lost_N / total_N) *100, 2)}%) did not have a 2-year follow-up imaging appointment and were, thus, excluded from further analyses.")

Of the total 11760 participants at baseline, 3958 (or 33.66%) did not have a 2-year follow-up imaging appointment and were, thus, excluded from further analyses.


In [11]:
table = pd.DataFrame(index=['N', 
                            'Age_mean_base',
                            'Age_sdev_base',
                            'Age_mean_2yfu',
                            'Age_sdev_2yfu',
                            'Sex_M', 
                            'Sex_F', 
                            'Ethnicity_H', 
                            'Ethnicity_NH', 
                            'Ethnicity_refuse', 
                            'Race_White', 
                            'Race_Black', 
                            'Race_NativeAmer', 
                            'Race_AsianPac', 
                            'Race_Other', 
                            'Income_gt100k', 
                            'Income_50to100k', 
                            'Income_lt50k',
                            'Income_dkrefuse', 
                            'MRI_Siemens', 
                            'MRI_GE', 
                            'MRI_Phillips'], 
                     columns=['whole_sample', 'with_2yfu'])

table.at['N', 'whole_sample'] = len(demo_df.index)
table.at['N', 'with_2yfu'] = len(y2fu_df.index)

table.at['Age_mean_base', 'whole_sample'] = np.mean(demo_df['interview_age.baseline_year_1_arm_1'])
table.at['Age_mean_base', 'with_2yfu'] = np.mean(y2fu_df['interview_age.baseline_year_1_arm_1'])

table.at['Age_mean_2yfu', 'whole_sample'] = np.mean(demo_df['interview_age.2_year_follow_up_y_arm_1'])
table.at['Age_mean_2yfu', 'with_2yfu'] = np.mean(y2fu_df['interview_age.2_year_follow_up_y_arm_1'])

table.at['Age_sdev_base', 'whole_sample'] = np.std(demo_df['interview_age.baseline_year_1_arm_1'])
table.at['Age_sdev_base', 'with_2yfu'] = np.std(y2fu_df['interview_age.baseline_year_1_arm_1'])

table.at['Age_sdev_2yfu', 'whole_sample'] = np.std(demo_df['interview_age.2_year_follow_up_y_arm_1'])
table.at['Age_sdev_2yfu', 'with_2yfu'] = np.std(y2fu_df['interview_age.2_year_follow_up_y_arm_1'])

table.at['Sex_M', 'whole_sample'] = len(demo_df[demo_df['sex.baseline_year_1_arm_1'] == 'M'].index)
table.at['Sex_M', 'with_2yfu'] = len(y2fu_df[y2fu_df['sex.baseline_year_1_arm_1'] == 'M'].index)
table.at['Sex_F', 'whole_sample'] = len(demo_df[demo_df['sex.baseline_year_1_arm_1'] == 'F'].index)
table.at['Sex_F', 'with_2yfu'] = len(y2fu_df[y2fu_df['sex.baseline_year_1_arm_1'] == 'F'].index)

table.at['Ethnicity_H', 
         'whole_sample'] = len(demo_df[demo_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 1.].index)
table.at['Ethnicity_H', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 1.].index)
table.at['Ethnicity_NH', 
         'whole_sample'] = len(demo_df[demo_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 2.].index)
table.at['Ethnicity_NH', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 2.].index)
table.at['Ethnicity_refuse', 
         'whole_sample'] = len(demo_df[demo_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 777.].index)
table.at['Ethnicity_refuse', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_prnt_ethn_v2.baseline_year_1_arm_1'] == 777.].index)

table.at['Race_White', 
         'whole_sample'] = len(demo_df[demo_df['demo_race_a_p___10.baseline_year_1_arm_1'] == 1.].index)
table.at['Race_White', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_race_a_p___10.baseline_year_1_arm_1'] == 1.].index)
table.at['Race_Black', 
         'whole_sample'] = len(demo_df[demo_df['demo_race_a_p___11.baseline_year_1_arm_1'] == 1.].index)
table.at['Race_Black', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_race_a_p___11.baseline_year_1_arm_1'] == 1.].index)

demo_df['Race_NativeAmer'] = demo_df['demo_race_a_p___12.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___13.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___14.baseline_year_1_arm_1']
y2fu_df['Race_NativeAmer'] = y2fu_df['demo_race_a_p___12.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___13.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___14.baseline_year_1_arm_1']

demo_df['Race_AsianPac'] = demo_df['demo_race_a_p___15.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___16.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___17.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___18.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___19.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___20.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___21.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___22.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___23.baseline_year_1_arm_1'] + demo_df['demo_race_a_p___24.baseline_year_1_arm_1']
y2fu_df['Race_AsianPac'] = y2fu_df['demo_race_a_p___15.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___16.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___17.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___18.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___19.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___20.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___21.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___22.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___23.baseline_year_1_arm_1'] + y2fu_df['demo_race_a_p___24.baseline_year_1_arm_1']

table.at['Race_NativeAmer', 
         'whole_sample'] = len(demo_df[demo_df['Race_NativeAmer'] > 0.].index)
table.at['Race_NativeAmer', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['Race_NativeAmer'] > 0.].index)
table.at['Race_AsianPac', 
         'whole_sample'] = len(demo_df[demo_df['Race_AsianPac'] > 0.].index)
table.at['Race_AsianPac', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['Race_AsianPac'] > 0.].index)

table.at['Race_Other', 
         'whole_sample'] = len(demo_df[demo_df['demo_race_a_p___25.baseline_year_1_arm_1'] == 1.].index)
table.at['Race_Other', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_race_a_p___25.baseline_year_1_arm_1'] == 1.].index)

table.at['Income_gt100k', 
         'whole_sample'] = len(demo_df[demo_df['demo_comb_income_v2.baseline_year_1_arm_1'] == 9.].index) + len(demo_df[demo_df['demo_comb_income_v2.baseline_year_1_arm_1'] == 10.].index)
table.at['Income_gt100k', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_comb_income_v2.baseline_year_1_arm_1'] == 9.].index) + len(y2fu_df[y2fu_df['demo_comb_income_v2.baseline_year_1_arm_1'] == 10.].index)

table.at['Income_50to100k', 
         'whole_sample'] = len(demo_df['demo_comb_income_v2.baseline_year_1_arm_1'].between(6., 9.).index)
table.at['Income_50to100k', 
         'with_2yfu'] = len(y2fu_df['demo_comb_income_v2.baseline_year_1_arm_1'].between(6., 9.).index)

table.at['Income_lt50k', 
         'whole_sample'] = len(demo_df[demo_df['demo_comb_income_v2.baseline_year_1_arm_1'] <= 6.].index)
table.at['Income_lt50k', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_comb_income_v2.baseline_year_1_arm_1'] <= 6.].index)

table.at['Income_dkrefuse', 
         'whole_sample'] = len(demo_df[demo_df['demo_comb_income_v2.baseline_year_1_arm_1'] >= 777.].index)
table.at['Income_dkrefuse', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['demo_comb_income_v2.baseline_year_1_arm_1'] >= 777.].index)

table.at['MRI_Siemens', 
         'whole_sample'] = len(demo_df[demo_df['mri_info_manufacturer.baseline_year_1_arm_1'] == "SIEMENS"].index)
table.at['MRI_Siemens', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['mri_info_manufacturer.baseline_year_1_arm_1'] == "SIEMENS"].index)
table.at['MRI_GE', 
         'whole_sample'] = len(demo_df[demo_df['mri_info_manufacturer.baseline_year_1_arm_1'] == "GE MEDICAL SYSTEMS"].index)
table.at['MRI_GE', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['mri_info_manufacturer.baseline_year_1_arm_1']  == "GE MEDICAL SYSTEMS"].index)
table.at['MRI_Philips', 
         'whole_sample'] = len(demo_df[demo_df['mri_info_manufacturer.baseline_year_1_arm_1'] == "Philips Medical Systems"].index)
table.at['MRI_Philips', 
         'with_2yfu'] = len(y2fu_df[y2fu_df['mri_info_manufacturer.baseline_year_1_arm_1'] == "Philips Medical Systems"].index)

In [13]:
# test for differences in means with wilcoxon signed rank test
stat, pval = sstats.mannwhitneyu(demo_df['interview_age.baseline_year_1_arm_1'], 
                                 y2fu_df['interview_age.baseline_year_1_arm_1'])
table.at['Age_mean_base', 'Stat'] = stat
table.at['Age_mean_base', 'p(Stat)'] = pval

stat, pval = sstats.mannwhitneyu(demo_df['interview_age.2_year_follow_up_y_arm_1'].dropna(), 
                                 y2fu_df['interview_age.2_year_follow_up_y_arm_1'].dropna())
table.at['Age_mean_2yfu', 'Stat'] = stat
table.at['Age_mean_2yfu', 'p(Stat)'] = pval

contingency = np.zeros((2,2))
contingency[0,0] = table.loc['Sex_M', 'whole_sample']
contingency[0,1] = table.loc['Sex_F', 'whole_sample']
contingency[1,0] = table.loc['Sex_M', 'with_2yfu']
contingency[1,1] = table.loc['Sex_F', 'with_2yfu']
out = contingency_tables.mcnemar(contingency) 
table.at['Sex_M', 'Stat'] = out.statistic
table.at['Sex_M', 'p(Stat)'] = out.pvalue

contingency = np.zeros((2,3))
contingency[0,0] = table.loc['Ethnicity_H', 'whole_sample']
contingency[0,1] = table.loc['Ethnicity_NH', 'whole_sample']
contingency[0,2] = table.loc['Ethnicity_refuse', 'whole_sample']
contingency[1,0] = table.loc['Ethnicity_H', 'with_2yfu']
contingency[1,1] = table.loc['Ethnicity_NH', 'with_2yfu']
contingency[1,2] = table.loc['Ethnicity_refuse', 'with_2yfu']
out = contingency_tables.cochrans_q(contingency) 
table.at['Ethnicity_H', 'Stat'] = out.statistic
table.at['Ethnicity_H', 'p(Stat)'] = out.pvalue

contingency = np.zeros((2,5))
contingency[0,0] = table.loc['Race_White', 'whole_sample']
contingency[0,1] = table.loc['Race_Black', 'whole_sample']
contingency[0,2] = table.loc['Race_NativeAmer', 'whole_sample']
contingency[0,3] = table.loc['Race_AsianPac', 'whole_sample']
contingency[0,4] = table.loc['Race_Other', 'whole_sample']
contingency[1,0] = table.loc['Race_White', 'with_2yfu']
contingency[1,1] = table.loc['Race_Black', 'with_2yfu']
contingency[1,2] = table.loc['Race_NativeAmer', 'with_2yfu']
contingency[1,3] = table.loc['Race_AsianPac', 'with_2yfu']
contingency[1,4] = table.loc['Race_Other', 'with_2yfu']
out = contingency_tables.cochrans_q(contingency) 
table.at['Race_White', 'Stat'] = out.statistic
table.at['Race_White', 'p(Stat)'] = out.pvalue

contingency = np.zeros((2,4))
contingency[0,0] = table.loc['Income_gt100k', 'whole_sample']
contingency[0,1] = table.loc['Income_50to100k', 'whole_sample']
contingency[0,2] = table.loc['Income_lt50k', 'whole_sample']
contingency[0,3] = table.loc['Income_dkrefuse', 'whole_sample']
contingency[1,0] = table.loc['Income_gt100k', 'with_2yfu']
contingency[1,1] = table.loc['Income_50to100k', 'with_2yfu']
contingency[1,2] = table.loc['Income_lt50k', 'with_2yfu']
contingency[1,3] = table.loc['Income_dkrefuse', 'with_2yfu']
out = contingency_tables.cochrans_q(contingency) 
table.at['Income_gt100k', 'Stat'] = out.statistic
table.at['Income_gt100k', 'p(Stat)'] = out.pvalue

contingency = np.zeros((2,3))
contingency[0,0] = table.loc['MRI_Siemens', 'whole_sample']
contingency[0,1] = table.loc['MRI_GE', 'whole_sample']
contingency[0,2] = table.loc['MRI_Philips', 'whole_sample']
contingency[1,0] = table.loc['MRI_Siemens', 'with_2yfu']
contingency[1,1] = table.loc['MRI_GE', 'with_2yfu']
contingency[1,2] = table.loc['MRI_Philips', 'with_2yfu']
out = contingency_tables.cochrans_q(contingency) 
table.at['MRI_Siemens', 'Stat'] = out.statistic
table.at['MRI_Siemens', 'p(Stat)'] = out.pvalue

In [14]:
table

Unnamed: 0,whole_sample,with_2yfu,Stat,p(Stat)
N,11760.0,7802.0,,
Age_mean_base,118.974,118.742,46695086.5,0.03396358
Age_sdev_base,7.49569,7.43653,,
Age_mean_2yfu,143.218,143.218,30233088.0,1.0
Age_sdev_2yfu,7.76211,7.76211,,
Sex_M,6146.0,4206.0,4206.0,6.623282e-46
Sex_F,5614.0,3596.0,,
Ethnicity_H,2023.0,1287.0,2.0,0.3678794
Ethnicity_NH,9666.0,6483.0,,
Ethnicity_refuse,23.0,11.0,,


In [16]:
table.loc['Sex_F', 'whole_sample'] / table.loc['N', 'whole_sample']

0.4773809523809524

In [17]:
table.loc['Sex_F', 'with_2yfu'] / table.loc['N', 'with_2yfu']

0.460907459625737