### Notebook to combine and subset MRI features for BayesDB analysis.

In [1]:
import pandas as pd

In [2]:
mri = pd.read_csv(
    "train_test_data/all_MRI_features.csv"
) # MRI features
qdf = pd.read_csv(
    "train_test_data/drop_aggregates.csv"
)
qdf=qdf.drop_duplicates("EID") # questionnaire features
print(mri.shape)
print(qdf.shape)

(146, 17594)
(630, 679)


In [3]:
regions = {
    'caudalanteriorcingulate',
    'caudalmiddlefrontal',
    'cuneus',
    'entorhinal',
    'fusiform',
    'inferiorparietal',
    'inferiortemporal',
    'insula',
    'isthmuscingulate',
    'lateraloccipital',
    'lateralorbitofrontal',
    'lingual',
    'medialorbitofrontal',
    'middletemporal',
    'paracentral',
    'parahippocampal',
    'parsopercularis',
    'parsorbitalis',
    'parstriangularis',
    'pericalcarine',
    'postcentral',
    'posteriorcingulate',
    'precentral',
    'precuneus',
    'rostralanteriorcingulate',
    'rostralmiddlefrontal',
    'superiorfrontal',
    'superiorparietal',
    'superiortemporal',
    'supramarginal',
    'transversetemporal'
}

See what underscore-delimited keywords we have in our feature set:

In [4]:
{thing.lower().replace("-", " ") for feature in list(mri.columns) for thing in feature.split("_")}

{"1st transverse temporal sulcus and heschl's sulcus",
 'adhd subtype',
 'anterior occipital sulcus',
 'ants',
 'area',
 'asd',
 'brain stem',
 'calcarine fissure',
 'central sulcus',
 'cingulate sulcus',
 'circular sulcus',
 'collateral sulcus',
 'cortex',
 'cortical',
 'csf',
 'ctx lh bankssts',
 'ctx lh caudalanteriorcingulate',
 'ctx lh caudalmiddlefrontal',
 'ctx lh cuneus',
 'ctx lh entorhinal',
 'ctx lh frontalpole',
 'ctx lh fusiform',
 'ctx lh inferiorparietal',
 'ctx lh inferiortemporal',
 'ctx lh insula',
 'ctx lh isthmuscingulate',
 'ctx lh lateraloccipital',
 'ctx lh lateralorbitofrontal',
 'ctx lh lingual',
 'ctx lh medialorbitofrontal',
 'ctx lh middletemporal',
 'ctx lh paracentral',
 'ctx lh parahippocampal',
 'ctx lh parsopercularis',
 'ctx lh parsorbitalis',
 'ctx lh parstriangularis',
 'ctx lh pericalcarine',
 'ctx lh postcentral',
 'ctx lh posteriorcingulate',
 'ctx lh precentral',
 'ctx lh precuneus',
 'ctx lh rostralanteriorcingulate',
 'ctx lh rostralmiddlefront

Features to focus on per Arno:
```Python
{'area', 'travel-depth', 'freesurfer-thickness', 'mean-curvature', 'volume', 'median', 'mad', 'laplace-beltrami-spectrum', 'zernike-moments'}
```
---
Features to ignore per Arno:
```Python
{'vessel', 'lesion', 'ants', 'geodesic', 'freesurfer-depth'}
```

In [5]:
mri_subset = mri[["EID"]].copy() # start with just EIDs
mri_subset = mri_subset.assign(
    left_cortical_grey_matter_volume=mri[
        [
            c for c in mri.columns if \
            "volume_per_freesurfer" in c and \
            "ctx-lh" in c
        ]
    ].sum(axis=1), # add left cortical grey matter volume
    right_cortical_grey_matter_volume=mri[
        [
            c for c in mri.columns if \
            "volume_per_freesurfer" in c and \
            "ctx-rh" in c
        ]
    ].sum(axis=1), # add right cortical grey matter volume
    left_cortical_white_matter_volume=mri[
        [
            c for c in mri.columns if \
            "volume_per_freesurfer" in c and \
            "wm-lh" in c
        ]
    ].sum(axis=1), # add left cortical white matter volume
    right_cortical_white_matter_volume=mri[
        [
            c for c in mri.columns if \
            "volume_per_freesurfer" in c and \
            "wm-rh" in c
        ]
    ].sum(axis=1), # add right cortical white matter volume
    csf_volume=mri[
        'volume_volume_per_freesurfer_label_CSF_ID'
    ], # add CSF volume
    whole_brain_volume=mri[
        [
            c for c in mri.columns if \
            "volume_per_freesurfer" in c
        ]
    ].sum(axis=1) # add whole brain volume
)
mri_subset = pd.concat(
    [
        mri_subset,
        pd.DataFrame(
            [pd.Series(
                mri[
                    [
                        c for c in mri.columns if \
                        "{0}: {1}_{2}".format(
                            feature,
                            statistic,
                            hemisphere
                        ) in c and "ctx-{0}h-{1}".format(
                            hemisphere[0],
                            region
                        ) in c
                    ]
                ].sum(axis=1),
                name="{1}_{2}_{3}_{0}".format(
                    "-".join(
                        feature.split(" ")
                    ),
                    statistic,
                    hemisphere,
                    region
                )
            ) for feature in [
                "freesurfer thickness",
                "travel depth"
            ] for statistic in [
                "median",
                "MAD"
            ] for hemisphere in [
                "left",
                "right"
            ] for region in regions]
        ).T, # add thickness and depth medians and MADs for all cortical regions
        pd.DataFrame(
            [pd.Series(
                mri[
                    [c for c in mri.columns if "{0}_{1}".format(
                feature,
                hemisphere
                    ) in c and "ctx-{0}h".format(
                        hemisphere[0],
                        region
                    ) in c]
                ].sum(axis=1),
                name="{1}_{2}_{0}".format(
                    "-".join(
                        feature.split(" ")
                    ),
                    hemisphere,
                    region
                )
            ) for feature in [
                "area"
            ] for hemisphere in [
                "left",
                "right"
            ] for region in regions]
        ).T, # add area for all cortical regions
        pd.DataFrame(
            [pd.Series(
                mri[
                    [
                        c for c in mri.columns if \
                        feature in c and \
                        region in c and \
                        hemisphere in c
                    ]
                ].sum(axis=1),
                name="{1}_{0}_{2}".format(
                    "-".join(
                        region.split("_")
                    ),
                    hemisphere,
                    "-".join(
                        feature.split("_")
                    )
                ).lower()
            ) for region in [
                "Amygdala",
                "Acumbens",
                "Caudate",
                "Cerebral-White-Matter",
                "Hippocampus",
                "Pallidum",
                "Putamen",
                "Thalamus",
                "UnsegmentedWhiteMatter"
            ] for hemisphere in [
                "Left",
                "Right"
            ] for feature in [
                "volume_per_freesurfer_label"
            ]]
        ).T # add volumes for subcortical regions
    ],
    axis=1
)

In [6]:
mri_subset.shape

(146, 335)

In [7]:
mri_subset["EID"] = mri_subset.EID.str.encode("UTF-8").str.decode("UTF-8")
qdf["EID"] = qdf.EID.str.encode("UTF-8").str.decode("UTF-8")

In [8]:
only_one_value = {c for c in mri_subset.columns if mri_subset[c].nunique()==1}
print(only_one_value)

{'left_acumbens_volume-per-freesurfer-label', 'right_acumbens_volume-per-freesurfer-label'}


In [9]:
mri_subset.drop(
    only_one_value,
    axis=1,
    inplace=True
)

In [10]:
mri_subset.shape

(146, 333)

In [11]:
mri_subset.to_csv(
    "train_test_data/mri_subset.csv",
    index=False
)

In [12]:
qdf

Unnamed: 0,APQ_P_01,APQ_P_02,APQ_P_03,APQ_P_04,APQ_P_05,APQ_P_06,APQ_P_07,APQ_P_08,APQ_P_09,APQ_P_10,...,SWAN_14,SWAN_15,SWAN_16,SWAN_17,SWAN_18,Sex,SocAnx_01,SocAnx_02,SocAnx_03,SocAnx_05
0,,,,,,,,,,,...,3.0,3.0,3.0,1.0,3.0,0.0,,,,
1,4.0,5.0,2.0,4.0,4.0,,4.0,2.0,5.0,1.0,...,0.0,-1.0,0.0,-2.0,0.0,1.0,,,,
2,4.0,5.0,2.0,3.0,5.0,2.0,4.0,1.0,5.0,1.0,...,1.0,2.0,3.0,2.0,3.0,1.0,,,,
3,5.0,5.0,3.0,2.0,3.0,3.0,3.0,2.0,5.0,1.0,...,2.0,0.0,1.0,1.0,1.0,0.0,,,,
4,3.0,5.0,3.0,5.0,5.0,1.0,5.0,3.0,5.0,1.0,...,2.0,2.0,2.0,2.0,2.0,0.0,,,,
5,4.0,3.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,2.0,...,0.0,0.0,-1.0,-1.0,0.0,1.0,,,,
6,5.0,5.0,2.0,5.0,3.0,1.0,4.0,1.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,,,,
7,4.0,4.0,3.0,3.0,2.0,2.0,4.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,
8,5.0,5.0,3.0,4.0,3.0,1.0,4.0,3.0,5.0,1.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,,,,
9,4.0,4.0,3.0,3.0,4.0,1.0,3.0,2.0,4.0,1.0,...,2.0,2.0,2.0,2.0,2.0,0.0,,,,


In [13]:
qdf = qdf.merge(
    mri_subset,
    on="EID",
    how="outer"
)
qdf = qdf.drop(
    [
        col for col in qdf if \
        "IAT_" in col or \
        col=="Dx"
    ],
    axis=1
)

In [14]:
qdf.to_csv(
    "train_test_data/questions_and_mri_regions.csv",
    index=False
)

In [15]:
len(qdf.EID.unique())

630

In [16]:
len(mri.EID.unique())

146

In [17]:
mri_subset[mri_subset["EID"]=="NDARZY668NMV"]

Unnamed: 0,EID,csf_volume,left_cortical_grey_matter_volume,left_cortical_white_matter_volume,right_cortical_grey_matter_volume,right_cortical_white_matter_volume,whole_brain_volume,median_left_parsopercularis_freesurfer-thickness,median_left_entorhinal_freesurfer-thickness,median_left_lateralorbitofrontal_freesurfer-thickness,...,left_hippocampus_volume-per-freesurfer-label,right_hippocampus_volume-per-freesurfer-label,left_pallidum_volume-per-freesurfer-label,right_pallidum_volume-per-freesurfer-label,left_putamen_volume-per-freesurfer-label,right_putamen_volume-per-freesurfer-label,left_thalamus_volume-per-freesurfer-label,right_thalamus_volume-per-freesurfer-label,left_unsegmentedwhitematter_volume-per-freesurfer-label,right_unsegmentedwhitematter_volume-per-freesurfer-label
145,NDARZY668NMV,529.861,244180.591,170488.906,239569.517,169123.555,933167.762,3.041051,3.709109,2.99811,...,3959.882,3873.875,1692.997,1621.837,3725.924,3646.572,6441.783,6146.392,19291.049,18784.225
