In [1]:
# imports
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import scipy.stats as sstats
from os.path import join, exists
from sklearn.metrics import matthews_corrcoef

In [2]:
import latex

In [3]:
PROJ_DIR = "/Volumes/projects_herting/LABDOCS/Personnel/Katie/deltaABCD_clustering/"
DATA_DIR = "data/"
FIGS_DIR = "figures/"
OUTP_DIR = "output/"

In [4]:
df = pd.read_csv(join(PROJ_DIR, DATA_DIR, "data_qcd.csv"), index_col=0, header=0)

In [5]:
df.drop(list(df.filter(regex='lesion.*').columns), axis=1, inplace=True)

In [6]:
# check for motion correlations
motion_vars = ['rsfmri_var_meanmotion.baseline_year_1_arm_1',
       'rsfmri_var_meanmotion.2_year_follow_up_y_arm_1',
       'rsfmri_var_subthreshnvols.baseline_year_1_arm_1',
       'rsfmri_var_subthreshnvols.2_year_follow_up_y_arm_1',
       'rsfmri_var_subtcignvols.baseline_year_1_arm_1',
       'rsfmri_var_subtcignvols.2_year_follow_up_y_arm_1',
       'rsfmri_var_ntpoints.baseline_year_1_arm_1',
       'rsfmri_var_ntpoints.2_year_follow_up_y_arm_1']

age_vars = ['interview_age.baseline_year_1_arm_1', 'interview_age.2_year_follow_up_y_arm_1']

index = motion_vars + age_vars
columns = pd.MultiIndex.from_product([index, ['r', 'p']])

motion_age = pd.DataFrame(index=index, columns=columns)

temp_df = df[index].dropna(how='any', axis=0)
for motion in motion_vars:
    for age in age_vars:
        r, p = sstats.spearmanr(temp_df[motion], temp_df[age])
        motion_age.at[motion, (age, 'r')] = r
        motion_age.at[motion, (age, 'p')] = p

In [7]:
motion_age.dropna(how='all', axis=1).dropna(how='all', axis=0).to_csv('motion_age_spearmanr.csv')

In [8]:
!pwd

/Users/katherine.b/Dropbox/Projects/deltaABCD_clustering/notebooks


In [9]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

In [10]:
motion_base = ['rsfmri_var_meanmotion.baseline_year_1_arm_1',
       'rsfmri_var_subthreshnvols.baseline_year_1_arm_1',
       'rsfmri_var_subtcignvols.baseline_year_1_arm_1',
       'rsfmri_var_ntpoints.baseline_year_1_arm_1']

age_base = ['interview_age.baseline_year_1_arm_1']
baseline = motion_base + age_base
base_df = temp_df[baseline]

base_df.columns = ['meanmotion','subthreshnvols', 'subtcignvols','ntpoints', 'age']

motion_2yfu = ['rsfmri_var_meanmotion.2_year_follow_up_y_arm_1',
       'rsfmri_var_subthreshnvols.2_year_follow_up_y_arm_1',
       'rsfmri_var_subtcignvols.2_year_follow_up_y_arm_1',
       'rsfmri_var_ntpoints.2_year_follow_up_y_arm_1']

age_2yfu = ['interview_age.2_year_follow_up_y_arm_1']
baseline = motion_base + age_base
y2fu = motion_2yfu + age_2yfu
base_df = temp_df[baseline]
y2fu_df = temp_df[y2fu]

base_df.columns = ['meanmotion','subthreshnvols', 'subtcignvols','ntpoints', 'age']
y2fu_df.columns = ['meanmotion_y2','subthreshnvols_y2', 'subtcignvols_y2','ntpoints_y2', 'age_y2']

sns.set(context='talk', style='white')

import matplotlib.pyplot as plt
fig,ax = plt.subplots(ncols=2, figsize=(10,5), sharey=True)
plt.tight_layout(w_pad=1)
sns.regplot(x='age', y='meanmotion', data=base_df, ax=ax[0],
            marker='x', line_kws={"color": "orange"}, ci=99, x_ci=95,
            scatter_kws={'alpha': 0.4, 'linewidth': 0.7})
ax[0].text(120, 5.5, 'r = -0.131, p ≈ 0')
sns.regplot(x='age_y2', y='meanmotion_y2', data=y2fu_df, ax=ax[1],
            marker='x', line_kws={"color": "#FFFFFF"}, ci=99, x_ci=95,
            scatter_kws={'alpha': 0.4, 'linewidth': 0.7})
ax[1].text(145, 5.5, 'r = -0.143, p ≈ 0')
fig.savefig('motion_age_correlations.png', bbox_inches='tight', dpi=400)

In [11]:
no_2yfu = df[df["interview_date.2_year_follow_up_y_arm_1"].isna() == True].index
df = df.drop(no_2yfu, axis=0)

In [12]:
deltasmri_complete = pd.concat([df.filter(regex='smri.*change_score'), 
                                df.filter(regex='mrisdp.*change_score')], axis=1).dropna()
deltarsfmri_complete = df.filter(regex='rsfmri.*change_score').dropna(how='any')
deltarsi_complete = df.filter(regex='dmri_rsi.*change_score').dropna()
deltadti_complete = df.filter(regex='dmri_dti.*change_score').dropna()

In [13]:
print(f'smri cols: {len(deltasmri_complete.columns)}')
print(f'rsfmri cols: {len(deltarsfmri_complete.columns)}')
print(f'rsi cols: {len(deltarsi_complete.columns)}')
print(f'dti cols: {len(deltadti_complete.columns)}')


smri cols: 293
rsfmri cols: 758
rsi cols: 251
dti cols: 175


In [14]:
imaging_qc = [
    "imgincl_dmri_include.baseline_year_1_arm_1",
    "imgincl_rsfmri_include.baseline_year_1_arm_1",
    "imgincl_t1w_include.baseline_year_1_arm_1",
    
    "imgincl_dmri_include.2_year_follow_up_y_arm_1",
    "imgincl_rsfmri_include.2_year_follow_up_y_arm_1",
    "imgincl_t1w_include.2_year_follow_up_y_arm_1",
    
]
nihtb = [
    "nihtbx_picvocab_uncorrected.change_score",
    #"nihtbx_flanker_uncorrected.change_score",
    #'nihtbx_list_uncorrected.change_score',
    #'nihtbx_cardsort_uncorrected.change_score',
    "nihtbx_pattern_uncorrected.change_score",
    "nihtbx_picture_uncorrected.change_score",
    "nihtbx_reading_uncorrected.change_score",
]
puberty = [  #'sex.baseline_year_1_arm_1', 'sex.2_year_follow_up_y_arm_1',
    "pds_p_ss_male_category_2.baseline_year_1_arm_1",
    "pds_p_ss_female_category_2.baseline_year_1_arm_1",
    "pds_p_ss_male_category_2.2_year_follow_up_y_arm_1",
    "pds_p_ss_female_category_2.2_year_follow_up_y_arm_1",
]
demographics = [
    "race_ethnicity.baseline_year_1_arm_1",
    #"demo_prnt_ethn_v2.baseline_year_1_arm_1",
    "demo_prnt_marital_v2.baseline_year_1_arm_1",
    "demo_prnt_ed_v2.baseline_year_1_arm_1",
    "demo_comb_income_v2.baseline_year_1_arm_1",
    #"demo_race_a_p___10.baseline_year_1_arm_1",
    #"demo_race_a_p___11.baseline_year_1_arm_1",
    #"demo_race_a_p___12.baseline_year_1_arm_1",
    #"demo_race_a_p___13.baseline_year_1_arm_1",
    #"demo_race_a_p___14.baseline_year_1_arm_1",
    #"demo_race_a_p___15.baseline_year_1_arm_1",
    #"demo_race_a_p___16.baseline_year_1_arm_1",
    #"demo_race_a_p___17.baseline_year_1_arm_1",
    #"demo_race_a_p___18.baseline_year_1_arm_1",
    #"demo_race_a_p___19.baseline_year_1_arm_1",
    #"demo_race_a_p___20.baseline_year_1_arm_1",
    #"demo_race_a_p___21.baseline_year_1_arm_1",
    #"demo_race_a_p___22.baseline_year_1_arm_1",
    #"demo_race_a_p___23.baseline_year_1_arm_1",
    #"demo_race_a_p___24.baseline_year_1_arm_1",
    #"demo_race_a_p___25.baseline_year_1_arm_1",
]
age = [  #'mri_info_manufacturer.baseline_year_1_arm_1',
    #'mri_info_manufacturer.2_year_follow_up_y_arm_1',
    "interview_age.baseline_year_1_arm_1",
    "interview_age.2_year_follow_up_y_arm_1",
]
family = ["rel_family_id.baseline_year_1_arm_1", 
          "rel_group_id.baseline_year_1_arm_1", 
          "rel_ingroup_order.baseline_year_1_arm_1", 
          "rel_relationship.baseline_year_1_arm_1"]
site = ["site_id_l.baseline_year_1_arm_1", "site_id_l.2_year_follow_up_y_arm_1"]
sex = ["sex.baseline_year_1_arm_1", "sex.2_year_follow_up_y_arm_1"]


In [15]:
miss_vars = nihtb + age + family + sex + imaging_qc + puberty
matrix_df = df[miss_vars]
msno.matrix(matrix_df, labels=True, fontsize=12)

<AxesSubplot:>

In [16]:
msno.heatmap(matrix_df, cmap="RdBu_r")

<AxesSubplot:>

In [17]:
deltarsfmri_complete

Unnamed: 0_level_0,rsfmri_var_cdk_banksstslh.change_score,rsfmri_var_cdk_cdaclatelh.change_score,rsfmri_var_cdk_cdmdflh.change_score,rsfmri_var_cdk_cuneuslh.change_score,rsfmri_var_cdk_entorhinallh.change_score,rsfmri_var_cdk_fflh.change_score,rsfmri_var_cdk_ifpalh.change_score,rsfmri_var_cdk_iftlh.change_score,rsfmri_var_cdk_ihclatelh.change_score,rsfmri_var_cdk_loccipitallh.change_score,...,rsfmri_cor_ngd_vs_scs_vtdclh.change_score,rsfmri_cor_ngd_vs_scs_crcxrh.change_score,rsfmri_cor_ngd_vs_scs_thprh.change_score,rsfmri_cor_ngd_vs_scs_cderh.change_score,rsfmri_cor_ngd_vs_scs_ptrh.change_score,rsfmri_cor_ngd_vs_scs_plrh.change_score,rsfmri_cor_ngd_vs_scs_hprh.change_score,rsfmri_cor_ngd_vs_scs_agrh.change_score,rsfmri_cor_ngd_vs_scs_aarh.change_score,rsfmri_cor_ngd_vs_scs_vtdcrh.change_score
subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NDAR_INV07HGA3B7,-3.510904,6.816887,20.529632,7.077337,-5.398273,26.059689,2.324849,-3.324514,41.272466,20.840076,...,-15.511061,10.663775,-22.597722,-40.484814,-48.072443,2.832451,-29.214148,-34.080908,-4.844560,-81.779399
NDAR_INV07RAHHYH,12.716948,-5.441272,-6.220807,35.983402,10.731715,45.870853,-0.454510,41.990798,24.429061,9.868687,...,-39.212219,-65.241769,-11.203119,34.155036,-85.652802,50.745673,70.257494,26.406865,28.614891,68.106259
NDAR_INV08P1JKNE,-5.791221,18.208679,23.386007,23.601652,-6.164564,-28.694095,8.187601,-39.242856,-15.345490,68.633201,...,23.630493,6.643079,-15.532718,3.345363,32.369261,-15.201887,14.482713,21.155580,-32.473666,-44.791534
NDAR_INV09AUXBBT,-1.385965,-15.345180,-17.413655,-26.633715,39.545188,5.500809,-24.522431,1.920222,-14.845889,-20.426550,...,79.711606,-51.659390,61.825338,38.602896,-63.194880,36.207358,-73.168232,-101.697846,48.663471,-25.333113
NDAR_INV09ZE6UUK,-9.499477,-27.747219,8.837471,29.070709,38.181672,8.377421,17.351787,40.837169,7.243915,19.544870,...,-10.903250,18.488198,-102.973622,38.159818,39.129259,6.207324,-14.976752,-42.998545,93.467923,-12.833558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDAR_INVXWW4FAU3,19.921558,15.802036,-2.317442,11.338673,55.608978,18.754147,20.673672,-31.688559,6.317147,8.719335,...,-31.256934,-88.044137,-10.551336,-22.827219,13.754953,-41.945378,73.868222,-53.056806,80.553919,51.698743
NDAR_INVZKJXBJMM,-26.630009,8.937675,-5.598836,-4.679926,-31.763326,-2.884219,10.909073,21.029526,10.411190,-16.751708,...,-52.882908,21.942404,83.265807,99.806260,57.948480,-71.342659,-5.686196,-44.861525,4.856022,-69.029099
NDAR_INVZKT1BV8Y,9.252107,21.556638,-19.208691,-13.367069,-35.797935,25.016467,-1.483524,-15.902409,14.097157,24.947534,...,-88.937296,-1.965248,-47.346099,1.609085,36.732543,50.286582,45.963100,24.666901,-62.791826,16.651771
NDAR_INVVKPJ803Y,-9.487092,-4.485570,-0.247189,-7.580429,24.460214,20.341336,6.715296,30.298367,8.139562,22.293039,...,-38.036147,-18.206241,18.967250,-95.824764,-2.613048,63.557645,-2.905922,-52.975368,4.599954,-71.405166


In [18]:
sex = pd.get_dummies(df[["sex.baseline_year_1_arm_1", "sex.2_year_follow_up_y_arm_1"]])
sex_cols = list(sex.columns)
df = pd.concat([df, sex], axis=1)
df.drop(
    ["sex.baseline_year_1_arm_1", "sex.2_year_follow_up_y_arm_1"], axis=1, inplace=True
)
mri = pd.get_dummies(
    df[
        [
            "mri_info_manufacturer.baseline_year_1_arm_1",
            "mri_info_manufacturer.2_year_follow_up_y_arm_1",
        ]
    ]
)
mri_cols = list(mri.columns)
df = pd.concat([df, mri], axis=1)
df.drop(
    [
        "mri_info_manufacturer.baseline_year_1_arm_1",
        "mri_info_manufacturer.2_year_follow_up_y_arm_1",
    ],
    axis=1,
    inplace=True,
)
site = pd.get_dummies(
    df[["site_id_l.baseline_year_1_arm_1", "site_id_l.2_year_follow_up_y_arm_1"]]
)
#site_cols = list(site.columns)#
#df = df = pd.concat([df, site], axis=1)
#df.drop(
#    ["site_id_l.baseline_year_1_arm_1", "site_id_l.2_year_follow_up_y_arm_1"],
#    axis=1,
#    inplace=True,
#)
site_cols = ["site_id_l.baseline_year_1_arm_1", "site_id_l.2_year_follow_up_y_arm_1"]

What am I going to do?
If I add baseline and 2yfu imaging QC variables then any value less than 2 is an exclude.
and I can convert everything less than 2 to np.nan to use for missingness assessments. 
Need to find out what the "mean motion" value is. FD? yes, in mm.

In [19]:
# build a mini dataset that represents missingness on each variable
keep = nihtb + puberty + demographics + age + family + sex_cols + mri_cols + site_cols
miss_df = df[mri_cols]

In [20]:
missing = miss_df.isna().replace({True: 1, False: 0})

for col1 in missing:
    for col2 in miss_df:
        if col1 != col2:
            temp_df = pd.concat([missing[col1], miss_df[col2]], axis=1)
            temp_df.dropna(how="any", inplace=True)
            if len(miss_df[col2].dropna().unique()) == 2:
                mcc = matthews_corrcoef(temp_df[col1].values, temp_df[col2].values)
                print(
                    f"{col1} missingness assoc with {col2} value\nmcc = {np.round(mcc, 4)}"
                )
            else:
                pbr = sstats.pointbiserialr(temp_df[col1].values, temp_df[col2].values)
                if pbr[1] < 0.01:
                    print(
                        f"{col1} missingness assoc with {col2} value\nr = {np.round(pbr[0], 4)}\t\tp = {np.round(pbr[1], 4)}"
                    )
                else:
                    pass
        else:
            pass

missing.corr(method="kendall")

# grab imaging var names bc missingness is redundant across those
dmri_rgx = 'dmri.*'
dmri_cols = df.filter(regex=dmri_rgx, axis=1).columns
smri_rgx = 'smri.*'
smri_cols = df.filter(regex=smri_rgx, axis=1).columns
fmri_rgx = 'rsfmri.*'
fmri_cols = df.filter(regex=fmri_rgx, axis=1).columns

In [22]:
# using ABCD's inclusion criteria to further mask the data
# add include from baseline and y2, then convert all values <2 to nan

modalities = ["t1w", "dmri", "rsfmri"]
for modality in modalities:
    missing = pd.Series(index=miss_df.index)
    missing = (
        df[f"imgincl_{modality}_include.baseline_year_1_arm_1"]
        + df[f"imgincl_{modality}_include.2_year_follow_up_y_arm_1"]
    )
    for i in miss_df.index:
        if missing.loc[i] == 2.0:
            miss_df.loc[i, f"{modality}_missing"] = 1
        else:
            miss_df.loc[i, f"{modality}_missing"] = np.nan

  missing = pd.Series(index=miss_df.index)


In [28]:
miss_df.sum()

mri_info_manufacturer.baseline_year_1_arm_1_GE MEDICAL SYSTEMS            2013.0
mri_info_manufacturer.baseline_year_1_arm_1_Philips Medical Systems        905.0
mri_info_manufacturer.baseline_year_1_arm_1_SIEMENS                       4539.0
mri_info_manufacturer.2_year_follow_up_y_arm_1_GE MEDICAL SYSTEMS         1995.0
mri_info_manufacturer.2_year_follow_up_y_arm_1_Philips Medical Systems     904.0
mri_info_manufacturer.2_year_follow_up_y_arm_1_SIEMENS                    4558.0
t1w_missing                                                               7115.0
dmri_missing                                                              6250.0
rsfmri_missing                                                            5661.0
dtype: float64

In [25]:
msno.matrix(miss_df, labels=True, fontsize=12, inline=True)

<AxesSubplot:>

In [26]:
# calculate percent misingness on each var
pct_miss = miss_df.isna().sum() / len(miss_df.index)

In [27]:
deltarsfmri_complete

Unnamed: 0_level_0,rsfmri_var_cdk_banksstslh.change_score,rsfmri_var_cdk_cdaclatelh.change_score,rsfmri_var_cdk_cdmdflh.change_score,rsfmri_var_cdk_cuneuslh.change_score,rsfmri_var_cdk_entorhinallh.change_score,rsfmri_var_cdk_fflh.change_score,rsfmri_var_cdk_ifpalh.change_score,rsfmri_var_cdk_iftlh.change_score,rsfmri_var_cdk_ihclatelh.change_score,rsfmri_var_cdk_loccipitallh.change_score,...,rsfmri_cor_ngd_vs_scs_vtdclh.change_score,rsfmri_cor_ngd_vs_scs_crcxrh.change_score,rsfmri_cor_ngd_vs_scs_thprh.change_score,rsfmri_cor_ngd_vs_scs_cderh.change_score,rsfmri_cor_ngd_vs_scs_ptrh.change_score,rsfmri_cor_ngd_vs_scs_plrh.change_score,rsfmri_cor_ngd_vs_scs_hprh.change_score,rsfmri_cor_ngd_vs_scs_agrh.change_score,rsfmri_cor_ngd_vs_scs_aarh.change_score,rsfmri_cor_ngd_vs_scs_vtdcrh.change_score
subjectkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NDAR_INV07HGA3B7,-3.510904,6.816887,20.529632,7.077337,-5.398273,26.059689,2.324849,-3.324514,41.272466,20.840076,...,-15.511061,10.663775,-22.597722,-40.484814,-48.072443,2.832451,-29.214148,-34.080908,-4.844560,-81.779399
NDAR_INV07RAHHYH,12.716948,-5.441272,-6.220807,35.983402,10.731715,45.870853,-0.454510,41.990798,24.429061,9.868687,...,-39.212219,-65.241769,-11.203119,34.155036,-85.652802,50.745673,70.257494,26.406865,28.614891,68.106259
NDAR_INV08P1JKNE,-5.791221,18.208679,23.386007,23.601652,-6.164564,-28.694095,8.187601,-39.242856,-15.345490,68.633201,...,23.630493,6.643079,-15.532718,3.345363,32.369261,-15.201887,14.482713,21.155580,-32.473666,-44.791534
NDAR_INV09AUXBBT,-1.385965,-15.345180,-17.413655,-26.633715,39.545188,5.500809,-24.522431,1.920222,-14.845889,-20.426550,...,79.711606,-51.659390,61.825338,38.602896,-63.194880,36.207358,-73.168232,-101.697846,48.663471,-25.333113
NDAR_INV09ZE6UUK,-9.499477,-27.747219,8.837471,29.070709,38.181672,8.377421,17.351787,40.837169,7.243915,19.544870,...,-10.903250,18.488198,-102.973622,38.159818,39.129259,6.207324,-14.976752,-42.998545,93.467923,-12.833558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NDAR_INVXWW4FAU3,19.921558,15.802036,-2.317442,11.338673,55.608978,18.754147,20.673672,-31.688559,6.317147,8.719335,...,-31.256934,-88.044137,-10.551336,-22.827219,13.754953,-41.945378,73.868222,-53.056806,80.553919,51.698743
NDAR_INVZKJXBJMM,-26.630009,8.937675,-5.598836,-4.679926,-31.763326,-2.884219,10.909073,21.029526,10.411190,-16.751708,...,-52.882908,21.942404,83.265807,99.806260,57.948480,-71.342659,-5.686196,-44.861525,4.856022,-69.029099
NDAR_INVZKT1BV8Y,9.252107,21.556638,-19.208691,-13.367069,-35.797935,25.016467,-1.483524,-15.902409,14.097157,24.947534,...,-88.937296,-1.965248,-47.346099,1.609085,36.732543,50.286582,45.963100,24.666901,-62.791826,16.651771
NDAR_INVVKPJ803Y,-9.487092,-4.485570,-0.247189,-7.580429,24.460214,20.341336,6.715296,30.298367,8.139562,22.293039,...,-38.036147,-18.206241,18.967250,-95.824764,-2.613048,63.557645,-2.905922,-52.975368,4.599954,-71.405166


In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
smri_comp = pca.fit_transform(deltasmri_complete)
rsfmri_comp = pca.fit_transform(deltarsfmri_complete)
rsi_comp = pca.fit_transform(deltarsi_complete)
dti_comp = pca.fit_transform(deltadti_complete)

In [None]:
smri_components = pd.DataFrame(index=deltasmri_complete.index, 
                               data=smri_comp, 
                               columns=['smri1', 'smri2', 'smri3'])
rsfmri_components = pd.DataFrame(index=deltarsfmri_complete.index, 
                                 data=rsfmri_comp, 
                                 columns=['rsfmri1', 'rsfmri2', 'rsfmri3'])
rsi_components = pd.DataFrame(index=deltarsi_complete.index, 
                              data=rsi_comp, 
                              columns=['rsi1', 'rsi2', 'rsi3'])
dti_components = pd.DataFrame(index=deltadti_complete.index, 
                              data=dti_comp, 
                              columns=['dti1', 'dti2', 'dti3'])

In [None]:
miss_w_comps = pd.concat([miss_df, smri_components, rsfmri_components, rsi_components, dti_components], axis=1)

In [None]:
msno.matrix(miss_w_comps)

In [None]:
msno.heatmap(miss_w_comps, cmap='RdBu_r')

In [None]:
miss_w_comps.to_csv(join(PROJ_DIR, DATA_DIR, 'img_components_and_data.csv'))

In [None]:
miss_prop = miss_w_comps.isna().sum() / len(miss_w_comps.index)

In [None]:
miss_prop.sort_values()

In [None]:
m_base_puberty_missing = df['pds_p_ss_male_category_2.baseline_year_1_arm_1'].isna().sum() 
f_base_puberty_missing = df['pds_p_ss_female_category_2.baseline_year_1_arm_1'].isna().sum()
(m_base_puberty_missing + f_base_puberty_missing) / len(df.index)

In [None]:
len(df.index)