### Notebook to combine and subset MRI features for BayesDB analysis.

In [None]:
import pandas as pd

In [None]:
# MRI features.
mri = pd.read_csv("train_test_data/all_MRI_features.csv")

# Questionnaire features, reduced to one row per EID.
qdf = pd.read_csv("train_test_data/drop_aggregates.csv").drop_duplicates(subset="EID")

# Report (rows, columns) of each table, one per line.
print(mri.shape, qdf.shape, sep="\n")

In [None]:
# Cortical region names. Each one is later combined with a hemisphere into a
# "ctx-lh-<region>" / "ctx-rh-<region>" fragment to pick columns out of `mri`.
regions = set(
    """
    caudalanteriorcingulate
    caudalmiddlefrontal
    cuneus
    entorhinal
    fusiform
    inferiorparietal
    inferiortemporal
    insula
    isthmuscingulate
    lateraloccipital
    lateralorbitofrontal
    lingual
    medialorbitofrontal
    middletemporal
    paracentral
    parahippocampal
    parsopercularis
    parsorbitalis
    parstriangularis
    pericalcarine
    postcentral
    posteriorcingulate
    precentral
    precuneus
    rostralanteriorcingulate
    rostralmiddlefrontal
    superiorfrontal
    superiorparietal
    superiortemporal
    supramarginal
    transversetemporal
    """.split()
)

See what underscore-delimited keywords we have in our feature set:

In [None]:
# Every distinct underscore-separated token found in the MRI column names,
# lower-cased and with hyphens turned into spaces.
{
    token.lower().replace("-", " ")
    for column_name in mri.columns
    for token in column_name.split("_")
}

Features to focus on per Arno:
```Python
{'area', 'travel-depth', 'freesurfer-thickness', 'mean-curvature', 'volume', 'median', 'mad', 'laplace-beltrami-spectrum', 'zernike-moments'}
```
---
Features to ignore per Arno:
```Python
{'vessel', 'lesion', 'ants', 'geodesic', 'freesurfer-depth'}
```

In [None]:
def _sum_columns_containing(frame, *fragments):
    """Row-wise sum of the columns of ``frame`` whose names contain every fragment.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose column names are searched.
    *fragments : str
        Substrings that must ALL occur in a column name for it to be included.

    Returns
    -------
    pandas.Series
        One value per row of ``frame``. If no column matches, pandas sums an
        empty selection, so the result is all zeros rather than an error.
    """
    matched = [
        column for column in frame.columns
        if all(fragment in column for fragment in fragments)
    ]
    return frame[matched].sum(axis=1)


def _build_mri_subset(mri, regions):
    """Assemble the reduced MRI feature table from the full feature table.

    Parameters
    ----------
    mri : pandas.DataFrame
        Full MRI feature table. Must contain the columns ``EID`` and
        ``volume_volume_per_freesurfer_label_CSF_ID``.
    regions : iterable of str
        Cortical region names; each is matched as ``ctx-lh-<region>`` /
        ``ctx-rh-<region>`` inside the column names of ``mri``.

    Returns
    -------
    pandas.DataFrame
        ``EID``, six aggregate volume columns, then (in this order)
        thickness/depth median and MAD per hemisphere and region, area per
        hemisphere and region, and volumes for the listed subcortical labels.
    """
    # Start from the EIDs and add the aggregate volumes.
    subset = mri[["EID"]].assign(
        left_cortical_grey_matter_volume=_sum_columns_containing(
            mri, "volume_per_freesurfer", "ctx-lh"
        ),
        right_cortical_grey_matter_volume=_sum_columns_containing(
            mri, "volume_per_freesurfer", "ctx-rh"
        ),
        left_cortical_white_matter_volume=_sum_columns_containing(
            mri, "volume_per_freesurfer", "wm-lh"
        ),
        right_cortical_white_matter_volume=_sum_columns_containing(
            mri, "volume_per_freesurfer", "wm-rh"
        ),
        csf_volume=mri["volume_volume_per_freesurfer_label_CSF_ID"],
        whole_brain_volume=_sum_columns_containing(mri, "volume_per_freesurfer"),
    )

    # `regions` is a set of strings, whose iteration order can change between
    # kernel sessions; sorting keeps the output column order reproducible.
    ordered_regions = sorted(regions)
    hemispheres = ("left", "right")
    derived = {}  # output column name -> Series, in insertion order

    # Thickness and depth medians and MADs for all cortical regions.
    for feature in ("freesurfer thickness", "travel depth"):
        for statistic in ("median", "MAD"):
            for hemisphere in hemispheres:
                for region in ordered_regions:
                    name = "{0}_{1}_{2}_{3}".format(
                        statistic, hemisphere, region, feature.replace(" ", "-")
                    )
                    derived[name] = _sum_columns_containing(
                        mri,
                        "{0}: {1}_{2}".format(feature, statistic, hemisphere),
                        "ctx-{0}h-{1}".format(hemisphere[0], region),
                    )

    # Area for all cortical regions.
    for feature in ("area",):
        for hemisphere in hemispheres:
            for region in ordered_regions:
                name = "{0}_{1}_{2}".format(
                    hemisphere, region, feature.replace(" ", "-")
                )
                derived[name] = _sum_columns_containing(
                    mri,
                    "{0}_{1}".format(feature, hemisphere),
                    # The region must be part of the label fragment. The
                    # previous template was "ctx-{0}h", which silently ignored
                    # `region` and gave every region the whole-hemisphere sum.
                    "ctx-{0}h-{1}".format(hemisphere[0], region),
                )

    # Volumes for subcortical regions.
    subcortical_regions = (
        "Amygdala",
        # NOTE(review): corrected from "Acumbens", which looks like a typo for
        # FreeSurfer's "Accumbens-area" label and would match no column
        # (all-zero result). Confirm against the actual names in mri.columns.
        "Accumbens",
        "Caudate",
        "Cerebral-White-Matter",
        "Hippocampus",
        "Pallidum",
        "Putamen",
        "Thalamus",
        "UnsegmentedWhiteMatter",
    )
    for region in subcortical_regions:
        for hemisphere in ("Left", "Right"):
            for feature in ("volume_per_freesurfer_label",):
                name = "{0}_{1}_{2}".format(
                    hemisphere,
                    region.replace("_", "-"),
                    feature.replace("_", "-"),
                ).lower()
                derived[name] = _sum_columns_containing(
                    mri, feature, region, hemisphere
                )

    return pd.concat(
        [subset, pd.DataFrame(derived, index=mri.index)],
        axis=1
    )


mri_subset = _build_mri_subset(mri, regions)

In [None]:
# (rows, columns) of the subset before constant columns are removed.
mri_subset.shape

In [None]:
# Round-trip the EID values of both tables through UTF-8 bytes and back to text.
mri_subset["EID"] = mri_subset["EID"].str.encode("UTF-8").str.decode("UTF-8")
qdf["EID"] = qdf["EID"].str.encode("UTF-8").str.decode("UTF-8")

In [None]:
# Names of the columns that hold exactly one distinct (non-null) value.
only_one_value = set(
    filter(
        lambda name: mri_subset[name].nunique() == 1,
        mri_subset.columns
    )
)
print(only_one_value)

In [None]:
# Remove the single-valued columns collected in `only_one_value`.
# Rebinding the name instead of using `inplace=True`, together with
# `errors="ignore"`, lets this cell be re-run without a KeyError once the
# columns are already gone.
mri_subset = mri_subset.drop(
    columns=list(only_one_value),
    errors="ignore"
)

In [None]:
# (rows, columns) of the subset after the single-valued columns were dropped.
mri_subset.shape

In [None]:
# Save the MRI subset without the row index.
mri_subset.to_csv("train_test_data/mri_subset.csv", index=False)

In [None]:
# Display the questionnaire table (one row per EID) before it is merged with the MRI subset.
qdf

In [None]:
# Outer-join questionnaire and MRI rows on EID so that participants present
# in only one of the two tables are kept.
qdf = qdf.merge(mri_subset, how="outer", on="EID")

# Discard every column whose name contains "IAT_", plus the "Dx" column.
qdf = qdf.drop(
    columns=[
        name for name in qdf.columns
        if "IAT_" in name or name == "Dx"
    ]
)

In [None]:
# Save the merged questionnaire + MRI table without the row index.
qdf.to_csv("train_test_data/questions_and_mri_regions.csv", index=False)

In [None]:
# Number of distinct EIDs in the merged table.
len(qdf["EID"].unique())

In [None]:
# Number of distinct EIDs in the full MRI feature table.
len(mri["EID"].unique())

In [None]:
# Spot-check the MRI subset row(s) for one specific EID.
mri_subset.loc[mri_subset.EID == "NDARZY668NMV"]