In [1]:
import sys
import pandas as pd
from os.path import join, exists

In [2]:
#read in csvs of interest one a time so you don't crash your computer
#grab the vars you want, then clear the rest and read in the next
#make one "missing" column for each modality if, like RSI, a subj is missing 
#on all vals if missing on one. double check this.
#also include qa column per modality and make missingness chart before/after data censoring

delta_dir = '/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/'
nonimg_dir = '/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/tabulated/non_img'
img_dir = '/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/tabulated/img/'

rsi_sc_path = join(delta_dir, 'RSI_gm/Subcortical_GM/rsi_subct_gm_changescores_bl_tp2.csv')
rsi_c_path = 'RSI_gm/Cortical_GM/'
rsi_wm_path = 'dMRI/RSI_wm/'
dti_wm_path = join(delta_dir, 'dMRI/DTI_wm/dmridtiwm_changescores_bl_tp2.csv')
rsfc_path = join(delta_dir, 'rsFMRI/rsfmri_changescores_bl_tp2.csv')
smri_path = join(delta_dir, 'sMRI/smri_changescores_bl_tp2.csv')
nihtb_path = join(delta_dir, 'NIH_Toolbox/nihtbx_changescores_bl_tp2.csv')

#hormones_path = join(nonimg_dir, 'abcd_hsss01.csv')
puberty_path = join(nonimg_dir, 'abcd_ssphp01.csv')
demo_path = join(nonimg_dir, 'pdem02.csv')
mri_path = join(img_dir, 'abcd_mri01.csv')
family_path = join(nonimg_dir, 'acspsw03.csv')
site_path = join(nonimg_dir, 'abcd_lt01.csv')

In [3]:
# column names to retain in the mega dataset
rsi_sc = []
rsi_c = [] 
rsi_wm = [] 
dti_wm = ['imgincl_dmri_include.baseline_year_1_arm_1', 'imgincl_dmri_include.2_year_follow_up_y_arm_1'] 
rsfc = ['imgincl_rsfmri_include.baseline_year_1_arm_1', 'imgincl_rsfmri_include.2_year_follow_up_y_arm_1',
         'rsfmri_c_ngd_meanmotion.baseline_year_1_arm_1', 'rsfmri_c_ngd_meanmotion.2_year_follow_up_y_arm_1', 
         'rsfmri_c_ngd_ntpoints.baseline_year_1_arm_1', 'rsfmri_c_ngd_ntpoints.2_year_follow_up_y_arm_1'] 
smri = ['imgincl_t1w_include.baseline_year_1_arm_1', 'imgincl_t1w_include.2_year_follow_up_y_arm_1',
        'imgincl_t2w_include.baseline_year_1_arm_1', 'imgincl_t2w_include.2_year_follow_up_y_arm_1', 
        'interview_age.baseline_year_1_arm_1', 'interview_age.2_year_follow_up_y_arm_1'] 
nihtb = ['nihtbx_picvocab_uncorrected.change_score',
         'nihtbx_flanker_uncorrected.change_score',
         'nihtbx_list_uncorrected.change_score',
         'nihtbx_cardsort_uncorrected.change_score',
         'nihtbx_pattern_uncorrected.change_score',
         'nihtbx_picture_uncorrected.change_score',
         'nihtbx_reading_uncorrected.change_score']
#hormones = ['hormone_scr_hse_mean', 'hormone_scr_ert_mean', 'hormone_scr_dhea_mean']
puberty = ['sex', 
           'pds_p_ss_male_category', 'pds_p_ss_female_category', 
           'pds_p_ss_male_category_2', 'pds_p_ss_female_category_2']
demographics = ['demo_prnt_ethn_v2', 'demo_prnt_marital_v2', 'demo_prnt_ed_v2', 'demo_comb_income_v2', 
                'demo_race_a_p___10', 'demo_race_a_p___11', 'demo_race_a_p___12', 'demo_race_a_p___13',
                'demo_race_a_p___14', 'demo_race_a_p___15', 'demo_race_a_p___16', 'demo_race_a_p___17',
                'demo_race_a_p___18', 'demo_race_a_p___19', 'demo_race_a_p___20', 'demo_race_a_p___21',
                'demo_race_a_p___22', 'demo_race_a_p___23', 'demo_race_a_p___24', 'demo_race_a_p___25']
mri = ['mri_info_manufacturer', 'interview_age']
family = ['rel_family_id',
          'rel_group_id',
          'rel_ingroup_order',
          'rel_relationship']
site = ['site_id_l']

# building a dictionary of column names
variables = {rsi_sc_path: rsi_sc,
             rsi_c_path: rsi_c,
             rsi_wm_path: rsi_wm,
             dti_wm_path: dti_wm, 
             rsfc_path: rsfc, 
             smri_path: smri, 
             nihtb_path: nihtb,
             #hormones_path: hormones,
             #puberty_path: puberty,
             demo_path: demographics,
             #mri_path: mri
             family_path: family,
             #site_path: site
            }

# the regular expressions for parsing each change score data structure
regex = {rsi_sc_path: 'dmri_(rsin0_|rsind_).*(change_score|baseline.*)', 
         rsi_c_path: 'dmri_(rsin0_|rsind_).*(change_score|baseline.*)', 
         rsi_wm_path: 'dmri_(rsin0_|rsind_).*(change_score|baseline.*)', 
         dti_wm_path: 'dmri_dti(fa|md).*(change_score|baseline.*)', 
         rsfc_path: 'rsfmri_(var|cor).*(change_score|baseline.*)', 
         smri_path: 'smri_(thick|area|t1wcnt).*(change_score|baseline.*)', 
         #nihtb_path: 'nihtbx_.*uncorrected.change_score'
        }

In [4]:
# using regex grab column names for all imaging and change score variables
for key in regex.keys():
    print(key)
    if exists(key):
        print('reading...')
        df = pd.read_csv(key, index_col='subjectkey', header=0, nrows=3)
        print('regexing...')
        small = df.filter(regex=regex[key], axis=1)
        variables[key] += list(small.columns)
    else:
        print('path does not exist')
print('done!')

/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/RSI_gm/Subcortical_GM/rsi_subct_gm_changescores_bl_tp2.csv
reading...
regexing...
RSI_gm/Cortical_GM/
path does not exist
dMRI/RSI_wm/
path does not exist
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/dMRI/DTI_wm/dmridtiwm_changescores_bl_tp2.csv
reading...
regexing...
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/rsFMRI/rsfmri_changescores_bl_tp2.csv
reading...
regexing...
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/sMRI/smri_changescores_bl_tp2.csv
reading...
regexing...
done!


In [5]:
#build the mega_df now
df = pd.DataFrame()
for key in variables.keys():
    print(key)
    old_columns = len(df.columns)
    if exists(key):
        temp_df = pd.read_csv(key, index_col='subjectkey', header=0)
        if 'acspsw03' in key:
            temp_df = temp_df[temp_df['eventname'] == 'baseline_year_1_arm_1']
        elif 'abcd_ssphp01' in key:
            temp_df = temp_df[temp_df['eventname'] == 'baseline_year_1_arm_1']
        elif 'abcd_lt01' in key:
            temp_df = temp_df[temp_df['eventname'] == 'baseline_year_1_arm_1']
        else: 
            pass
        df = pd.concat([df, temp_df.filter(variables[key])], axis=1, sort=True)
    else:
        print(f'\tDoes not exist.')
    new_columns = len(df.columns) - old_columns
    if new_columns == len(variables[key]):
        print(f'\tAll {len(variables[key])} variables added!')
    else:
        print(f'\t{len(variables[key]) - new_columns} variables missing from data structure.')
    #if 0 < len(variables[key]) < 21:
        #print(f'\t{df.iloc[2][variables[key][0]]}')

/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/RSI_gm/Subcortical_GM/rsi_subct_gm_changescores_bl_tp2.csv
	All 68 variables added!
RSI_gm/Cortical_GM/
	Does not exist.
	All 0 variables added!
dMRI/RSI_wm/
	Does not exist.
	All 0 variables added!
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/dMRI/DTI_wm/dmridtiwm_changescores_bl_tp2.csv
	All 858 variables added!
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/rsFMRI/rsfmri_changescores_bl_tp2.csv
	All 1428 variables added!
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/sMRI/smri_changescores_bl_tp2.csv
	All 648 variables added!
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/Change_Scores/NIH_Toolbox/nihtbx_changescores_bl_tp2.csv
	All 7 variables added!
/Volumes/projects_herting/LABDOCS/PROJECTS/ABCD/Data/pre_release4.0/tabulated/non_img/pdem02.csv
	All 20 variables added!
/Volu

In [6]:
mri_df = pd.read_csv(mri_path, index_col=['subjectkey', 'eventname'], header=0)
site_df = pd.read_csv(site_path, index_col=['subjectkey', 'eventname'], header=0)
puberty_df = pd.read_csv(puberty_path, index_col=['subjectkey', 'eventname'], header=0)
#mri_df = mri_df.loc[mri_df.sort_values('eventname').index.duplicated(keep='first')]

In [7]:
skipped = 0
for i in df.index:
    try:
        df.loc[i, 'mri_info_manufacturer.baseline_year_1_arm_1'] = mri_df.loc[(i, 
                                                                               'baseline_year_1_arm_1'), 
                                                                              'mri_info_manufacturer']
        df.loc[i, 'mri_info_manufacturer.2_year_follow_up_y_arm_1'] = mri_df.loc[(i, 
                                                                                  '2_year_follow_up_y_arm_1'), 
                                                                                 'mri_info_manufacturer']
        df.loc[i, 'site_id_l.baseline_year_1_arm_1'] = site_df.loc[(i, 
                                                                    'baseline_year_1_arm_1'), 
                                                                   'site_id_l']
        df.loc[i, 'site_id_l.2_year_follow_up_y_arm_1'] = site_df.loc[(i, 
                                                                       '2_year_follow_up_y_arm_1'), 
                                                                      'site_id_l']
        for var in puberty:
            df.loc[i, f'{var}.baseline_year_1_arm_1'] = puberty_df.loc[(i, 
                                                                        'baseline_year_1_arm_1'),
                                                                       var]
            df.loc[i, f'{var}.2_year_follow_up_y_arm_1'] = puberty_df.loc[(i,
                                                                           '2_year_follow_up_y_arm_1'),
                                                                          var]
        #print('{0} success'.format(i))
    except:
        skipped += 1

In [8]:
print(f'Full dataframe is {sys.getsizeof(df) / 1000000000}GB.')

Full dataframe is 0.295583124GB.


In [9]:
df.to_csv('/Volumes/projects_herting/LABDOCS/Personnel/Katie/deltaABCD_clustering/data/data.csv')