Notebook used to create the clinical dataset from ADNI neuropsychological assessments and clinical evaluations.

> **IMPORTANT**: The data used as input for this notebook must be downloaded directly from the ADNI website.

In [None]:
import os
import pandas as pd 
import numpy as np
import warnings
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from IPython.display import display

In [None]:
# Folder layout: raw ADNI downloads are read from `download`, results are written to `processed`
PATH_TO_ADNI_DATA = Path('..') / 'data' / 'adni'
PATH_TO_OUTPUT_DATA = PATH_TO_ADNI_DATA.joinpath('processed')
PATH_TO_DOWNLOAD_DATA = PATH_TO_ADNI_DATA.joinpath('download')

# Diagnostic summary and demographics
PATH_TO_DXSUM = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_DXSUM_28Apr2024.csv')
PATH_TO_DEMO = PATH_TO_DOWNLOAD_DATA.joinpath('PTDEMOG_27Mar2025.csv')

# Neuropsychological battery and ADAS scores
PATH_TO_NEUROBAT = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_NEUROBAT_28Apr2024.csv')
PATH_TO_ADAS_1 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_ADASSCORES_28Apr2024.csv')
PATH_TO_ADAS_GO23 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_ADAS_ADNIGO23_28Apr2024.csv')

# MMSE and CDR
PATH_TO_MMSE = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_MMSE_28Apr2024.csv')
PATH_TO_CDR = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_CDR_28Apr2024.csv')

# Neuroimaging quality-control tables (two per modality)
PATH_TO_AMY_QC1 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_AMYQC_28Apr2024.csv')
PATH_TO_AMY_QC2 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_AV45QC_28Apr2024.csv')
PATH_TO_FDG_QC1 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_PETC3_28Apr2024.csv')
PATH_TO_FDG_QC2 = PATH_TO_DOWNLOAD_DATA.joinpath('All_Subjects_PETQC_28Apr2024.csv')

# Diagnostic information

In [None]:
def mapDiagnostic(v):
    """Map etiology code(s) to their short text labels.

    Parameters
    ----------
    v : str, int, float or missing value
        Either a single numeric code (e.g. ``9`` or ``9.0``) or a string with
        one or more codes separated by ``|`` (e.g. ``'1|9'``).

    Returns
    -------
    str or float
        The labels joined by ``|``. ``np.nan`` is returned when the input is
        missing, is the ``-4`` sentinel (as a string or as a number), or
        contains no code present in the mapping. Codes that are not in the
        mapping are skipped.

    Raises
    ------
    ValueError
        If ``v`` is neither a number nor a string.
    """
    if pd.isna(v):
        return np.nan

    mapping = {
        '1': 'ftd', '2': 'parkinson', '3': 'huntington', '4': 'psp',
        '5': 'oh', '6': 'nph', '7': 'mdd', '8': 'corticobasal',
        '9': 'vascular', '10': 'prion', '11': 'hiv', '12': 'ppa',
        '13': 'corticopost', '14': 'other'
    }

    # numeric codes (including numpy scalars) are converted to strings without decimals
    if isinstance(v, (float, int, np.integer, np.floating)):
        v = str(int(v))

    if not isinstance(v, str):
        raise ValueError(f"Invalid input type: {type(v)}")

    codes = (code.strip() for code in v.split('|'))
    labels = [mapping[code] for code in codes if code in mapping]

    # nothing recognised (e.g. the '-4' sentinel): report a missing value
    # instead of an empty string so that downstream isna() checks catch it
    if not labels:
        return np.nan

    return '|'.join(labels)


def calculateFutureVarValue(
    df: pd.DataFrame,
    y_followup: int,
    m_window: int,
    target_var: str,
    group_id: str,
    date_id: str
) -> pd.DataFrame:
    """Attach to every row the value that `target_var` takes about `y_followup` years later.

    For each group (`group_id`), every row is paired with all the rows of the
    same group. A pair is kept when the elapsed time between both dates differs
    from ``365 * y_followup`` days by less than ``m_window * 30`` days. When
    several rows qualify as follow-up of the same row, the one whose elapsed
    time is closest to `y_followup` years is used.

    Parameters
    ----------
    df : pd.DataFrame
        Input data. `group_id` and `date_id` must be index level names and the
        `date_id` level must hold datetimes. The input is not modified.
    y_followup : int
        Target follow-up period, in years.
    m_window : int
        Tolerance around the target follow-up, in months (30-day months).
    target_var : str
        Column whose future value is looked up.
    group_id : str
        Index level identifying the groups (e.g. the subject).
    date_id : str
        Index level holding the date of each row.

    Returns
    -------
    pd.DataFrame
        A copy of `df`, concatenated group by group, with three new columns:
        ``years_diff_<Y>Y`` (elapsed years, days / 365), ``<target_var>_<Y>Y``
        (future value) and ``date_<Y>Y`` (date of the follow-up row). Rows
        without a follow-up inside the window have missing values there.

    Raises
    ------
    AssertionError
        If the number of rows of the result differs from the input.
    """
    years_col = 'years_diff_%dY' % y_followup
    value_col = '%s_%dY' % (target_var, y_followup)
    date_col = 'date_%dY' % y_followup
    followup_cols = [years_col, value_col, date_col]

    df = df.copy()
    df[years_col] = np.nan
    df[value_col] = np.nan
    df[date_col] = np.nan
    df['__date'] = df.index.get_level_values(date_id)

    new_df = []
    for sub_id, sub_df in tqdm(df.groupby(group_id), desc='Crossing data...'):

        # subject without followup
        if sub_df.shape[0] == 1:
            new_df.append(sub_df)
            continue

        # create all possible row combinations (suffix _x: baseline row, _y: candidate follow-up)
        flat = sub_df.reset_index()
        sub_df_cross = flat[[group_id, date_id, target_var, '__date']].merge(
            flat[[target_var, '__date']], how='cross')

        # select only entries in window
        elapsed_days = (sub_df_cross['__date_y'] - sub_df_cross['__date_x']).dt.days
        in_window = (elapsed_days - 365 * y_followup).abs() < (m_window * 30)
        sub_df_cross = sub_df_cross.loc[in_window].copy()

        # select/create variables of interest
        sub_df_cross[years_col] = elapsed_days[in_window] / 365
        sub_df_cross[value_col] = sub_df_cross['%s_y' % target_var]
        sub_df_cross[date_col] = sub_df_cross['__date_y']
        sub_df_cross = sub_df_cross.set_index([group_id, date_id])[followup_cols]

        if sub_df_cross.index.duplicated().any():
            # several candidates for the same baseline row: keep the one
            # closest to the target followup period
            distance = (sub_df_cross[years_col] - y_followup).abs().to_numpy()
            sub_df_cross = sub_df_cross.iloc[np.argsort(distance, kind='stable')]
            sub_df_cross = sub_df_cross.loc[~sub_df_cross.index.duplicated(keep='first')]
            assert not sub_df_cross.index.duplicated().any()

        # add the information for every subject, with or without duplicated
        # candidates; joining (instead of writing into the NaN placeholder
        # columns) lets each new column take its natural dtype
        sub_df = sub_df.drop(columns=followup_cols).join(sub_df_cross, how='left')
        new_df.append(sub_df[df.columns])

    # format the final dataframe (an empty input produces no groups to concatenate)
    new_df = pd.concat(new_df, axis=0) if new_df else df
    new_df[date_col] = pd.to_datetime(new_df[date_col])
    new_df = new_df.drop(columns=['__date'])

    assert new_df.shape[0] == df.shape[0], 'Shape missmatch'

    return new_df

In [None]:
# load diagnostic information
dx = pd.read_csv(PATH_TO_DXSUM)
print('Initial shape: %d' % dx.shape[0])

# remove entries without date or subject identifier
dx = dx.dropna(subset=['EXAMDATE', 'PTID'])
print('Shape after removing key variables: %d' % dx.shape[0])

# process variables
dx['EXAMDATE'] = pd.to_datetime(dx['EXAMDATE'])
dx = dx.rename(columns={
    'PTID': 'subject_id',
    'EXAMDATE': 'diag_date'
})


def _recode(values, codes):
    """Translate numeric codes into labels; missing values and unknown codes become NaN."""
    return values.apply(lambda v: np.nan if pd.isna(v) else codes.get(int(v), np.nan))


# Codify variables: (source column, new column, code -> label)
simple_recodes = [
    # DIAGNOSIS: 1=CN, 2=MCI, 3=Dementia
    ('DIAGNOSIS', 'diagnosis', {1: 'control', 2: 'mci', 3: 'dementia'}),
    # DXMPTR3: (Petersen) Normal general cognitive function (1=Yes; 0=No; 2=Marginal)
    ('DXMPTR3', 'petersen_normal_cognitive_func', {1: 'yes', 0: 'no', 2: 'marginal'}),
    # DXMPTR4: (Petersen) Normal activities of daily living (1=Yes; 0=No; 2=Marginal)
    ('DXMPTR4', 'petersen_normal_daily_living', {1: 'yes', 0: 'no', 2: 'marginal'}),
    # DXMPTR5: (Petersen) Objective memory impairment for age and education (1=Yes; 0=No)
    ('DXMPTR5', 'petersen_objective_ci', {1: 'yes', 0: 'no'}),
    # DXMPTR6: (Petersen) Not demented by diagnostic criteria (1=Yes; 0=No)
    # IMPORTANT: the logic is reversed on purpose (the new variable means "demented")
    ('DXMPTR6', 'petersen_demented_by_diag', {0: 'yes', 1: 'no'}),
    # DXMDUE: Suspected cause of MCI (1=MCI due to Alzheimer's Disease; 2=MCI due to other etiology)
    ('DXMDUE', 'cause_mci', {1: 'ad', 2: 'other'}),
    # DXDSEV: Dementia Severity - Clinician's Impression (1=Mild; 2=Moderate; 3=Severe)
    ('DXDSEV', 'dementia_stage', {1: 'mild', 2: 'moderate', 3: 'severe'}),
    # DXAPP: If Dementia due to Alzheimer's Disease, indicate likelihood (1=Probable; 2=Possible)
    ('DXAPP', 'cause_dementia', {1: 'ad_probable', 2: 'ad_possible'}),
]
for source_col, target_col, codes in simple_recodes:
    dx[target_col] = _recode(dx.pop(source_col), codes)

# DXMOTHET / DXODES: etiology when the MCI / dementia is not due to AD
#     (1=Fronto-temporal Dementia; 2=Parkinson's Disease; 3=Huntington's Disease; 4=Progressive
#     Supranuclear Palsy; 5=Alcohol-related Dementia; 6=NPH; 7=Major Depression; 8=Corticobasal
#     Degeneration; 9=Vascular Dementia; 10=Prion-Associated Dementia; 11=HIV; 12=Primary
#     Progressive Aphasia; 13=Posterior Cortical Dysfunction; 14=Other (specify))
dx['cause_mci_non_ad_eth'] = dx.pop('DXMOTHET').apply(mapDiagnostic)
dx['cause_dementia_non_ad_eth'] = dx.pop('DXODES').apply(mapDiagnostic)

# when the MCI is not due to AD, replace the cause by the specific etiology
mci_not_ad = dx['cause_mci'] != 'ad'
dx.loc[mci_not_ad, 'cause_mci'] = dx.loc[mci_not_ad, 'cause_mci_non_ad_eth']

# when the dementia is not due to AD, replace the cause by the specific etiology
# (taken from the dementia etiology column, not from the MCI one)
dementia_not_ad = ~dx['cause_dementia'].isin(['ad_probable', 'ad_possible'])
dx.loc[dementia_not_ad, 'cause_dementia'] = dx.loc[dementia_not_ad, 'cause_dementia_non_ad_eth']

# create a primary diagnosis (object dtype so that text labels can be stored)
dx['primary_diagnosis'] = pd.Series(np.nan, index=dx.index, dtype=object)

is_dementia = dx['diagnosis'] == 'dementia'
dx.loc[is_dementia, 'primary_diagnosis'] = dx.loc[is_dementia, 'cause_dementia']

# NOTE(review): for mci and control entries only cause_mci is used; cause_dementia
# is not used as a fallback when cause_mci is missing - confirm this is intended
is_mci_or_control = dx['diagnosis'].isin(['mci', 'control'])
dx.loc[is_mci_or_control, 'primary_diagnosis'] = dx.loc[is_mci_or_control, 'cause_mci']

# controls without any reported cause are labelled as plain controls
dx.loc[(dx['diagnosis'] == 'control') & (dx['primary_diagnosis'].isna()), 'primary_diagnosis'] = 'control'

# format the index and select the variables of interest removing duplicated
dx = dx[[
    'subject_id',
    'diag_date',
    'diagnosis',
    'primary_diagnosis',
    'petersen_normal_cognitive_func',
    'petersen_normal_daily_living',
    'petersen_objective_ci',
    'petersen_demented_by_diag',
    'cause_mci',
    'cause_dementia',
    'cause_mci_non_ad_eth',
    'cause_dementia_non_ad_eth',
    'dementia_stage'
]].copy()
dx = dx.drop_duplicates().set_index(['subject_id', 'diag_date']).sort_index()

# drop duplicated indices keeping the last
dx = dx.loc[~dx.index.duplicated(keep='last')]

In [None]:
# Attach the diagnosis observed 2 and 4 years later (each with a +/- 6 month window).
# Warnings raised while crossing the data are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for followup_years in (2, 4):
        dx = calculateFutureVarValue(
            df=dx,
            y_followup=followup_years,
            m_window=6,
            target_var='diagnosis',
            group_id='subject_id',
            date_id='diag_date'
        )

In [None]:
# load demographic information (source column -> new name)
demo_variables = {
    'PTID': 'subject_id',
    'PTEDUCAT': 'yschooling',
    'PTGENDER': 'sex_1M_2F',
    'PTDOB': 'date_of_birth',
}
demo = pd.read_csv(PATH_TO_DEMO)
print('Initial shape: %d' % demo.shape[0])

# process variables
demo = demo.rename(columns=demo_variables)[list(demo_variables.values())]

# drop entries with missing or invalid values
demo = demo.dropna(how='any')
demo = demo.loc[
    (demo['yschooling'] > 0) &
    demo['sex_1M_2F'].isin([1, 2])
].copy()
print('Shape after removing entries with missing information: %d' % demo.shape[0])

# convert birth to datetime BEFORE removing duplicates: the maximum of the raw
# 'MM/YYYY' strings would be lexicographic (e.g. '12/1930' > '01/1940')
demo['date_of_birth'] = pd.to_datetime(demo['date_of_birth'], format='%m/%Y')

# remove duplicated entries keeping, per subject, the maximum of each variable
demo = demo.groupby('subject_id').max()

print('Shape after removing duplicates: %d' % demo.shape[0])

# add demographic information to diagnostic data (matched on the subject_id index level)
dx = dx.join(demo, how='left')

# calculate subject age (in years) at the date of the diagnosis
dx['age'] = (dx.index.get_level_values('diag_date') - dx['date_of_birth']).dt.days / 365.25


In [None]:
# Summary of the diagnostic labels: frequency of each combination and number of missing values
diagnosis_columns = dx[['diagnosis', 'primary_diagnosis']]

combination_counts = pd.DataFrame(diagnosis_columns.value_counts())
display(combination_counts.sort_index())

missing_counts = pd.DataFrame(diagnosis_columns.isna().sum(), columns=['n_missing'])
display(missing_counts)

In [None]:
# Write the diagnosis table to the output folder, prefixing the file name with today's date (YYYYMMDD)
export_stamp = datetime.now().strftime('%Y%m%d')
diagnosis_file = os.path.join(PATH_TO_OUTPUT_DATA, '%s_diagnosis.parquet' % export_stamp)
dx.to_parquet(diagnosis_file)

# Neuropsychological information

In [None]:
# Variables of interest: source column -> name used in the processed dataset.
# The prefix of each new name is the cognitive domain it is assigned to.
neurobat_variables = {
    # memory (AVLT)
    'AVDELTOT': 'memory_avlt_recognition',
    'AVTOT1': 'memory_avlt_trial_1',
    'AVTOT2': 'memory_avlt_trial_2',
    'AVTOT3': 'memory_avlt_trial_3',
    'AVTOT4': 'memory_avlt_trial_4',
    'AVTOT5': 'memory_avlt_trial_5',
    'AVTOT6': 'memory_avlt_trial_6',
    'AVDEL30MIN': 'memory_avlt_delayed',
    # language (category fluency)
    'CATANIMSC': 'language_cat_fluency',
    'CATANPERS': 'language_cat_perseveration',
    # language (BNT)
    'BNTSPONT': 'language_bnt_tot_correct_no_cue',
    'BNTCSTIM': 'language_bnt_tot_correct_scue',
    'BNTCPHON': 'language_bnt_tot_correct_pcue',
    'BNTTOTAL': 'language_bnt_tot',
    # executive functions (TMT)
    'TRAASCOR': 'exec_tmt_a_time',
    'TRABSCOR': 'exec_tmt_b_time',
    # visuospatial functioning (clock test)
    'CLOCKSCOR': 'visuos_clock_draw_tot_score',
    'COPYSCOR': 'visuos_clock_copy_tot_score',
    # attention (digit span)
    'DSPANFOR': 'attention_digit_span_forward',
    'DSPANBAC': 'attention_digit_span_backward',
}

# ADAS items as named in the ADNI 1 scores file.
# NOTE(review): Q10/Q12 are mapped the other way round than Q10SCORE/Q12SCORE below and
# number cancellation is Q14 here vs Q13SCORE there - presumably the two files number
# the items differently; confirm against the ADNI data dictionary.
adas_1_variables = {
    # memory
    'Q1': 'memory_word_recall',
    'Q4': 'memory_word_recall_delayed',
    'Q8': 'memory_word_recognition',
    'Q9': 'memory_remembering_test',
    # language
    'Q2': 'language_commands',
    'Q5': 'language_naming',
    'Q12': 'language_comprehension',
    'Q11': 'language_word_finding_diff',
    'Q10': 'language_spoken_language',
    # visuospatial
    'Q3': 'visuos_constructional_praxis',
    'Q6': 'visuos_ideational_praxis',
    # attention (different coding)
    'Q14': 'attention_number_cancellation',
}

# ADAS items as named in the ADNI GO/2/3 file (same target names, same order as above)
adas_go23_variables = {
    # memory
    'Q1SCORE': 'memory_word_recall',
    'Q4SCORE': 'memory_word_recall_delayed',
    'Q8SCORE': 'memory_word_recognition',
    'Q9SCORE': 'memory_remembering_test',
    # language
    'Q2SCORE': 'language_commands',
    'Q5SCORE': 'language_naming',
    'Q10SCORE': 'language_comprehension',
    'Q11SCORE': 'language_word_finding_diff',
    'Q12SCORE': 'language_spoken_language',
    # visuospatial
    'Q3SCORE': 'visuos_constructional_praxis',
    'Q6SCORE': 'visuos_ideational_praxis',
    # attention (different coding)
    'Q13SCORE': 'attention_number_cancellation',
}

In [None]:
# load NEUROBAT information
neurobat = pd.read_csv(PATH_TO_NEUROBAT)
print('Initial shape: %d' % neurobat.shape[0])

# fill missing visit dates from another date column of the same row, when available
# NOTE(review): the fallback columns are an assumption (EXAMDATE is the one named in the
# original comment, USERDATE the one used for the MMSE data) - confirm the preferred source
for fallback_date in ('EXAMDATE', 'USERDATE'):
    if fallback_date in neurobat.columns:
        neurobat['VISDATE'] = neurobat['VISDATE'].fillna(neurobat[fallback_date])

# remove entries without date or subject identifier
neurobat = neurobat.dropna(subset=['VISDATE', 'PTID'])
print('Shape after removing key variables: %d' % neurobat.shape[0])

# process variables
neurobat['VISDATE'] = pd.to_datetime(neurobat['VISDATE'])
neurobat = neurobat.rename(columns={
    'PTID': 'subject_id',
    'VISDATE': 'neurobat_date'
}).set_index(['subject_id', 'neurobat_date']).sort_index()

# select variables (rename fails loudly if any expected column is absent)
neurobat = neurobat[list(neurobat_variables.keys())].rename(columns=neurobat_variables, errors='raise')
neurobat = neurobat.dropna(how='all')

# drop duplicates keeping the last entry of each (subject, date)
neurobat = neurobat.sort_index().groupby(['subject_id', 'neurobat_date']).nth(-1)

print('Shape after removing missing in neuro vars: %d' % neurobat.shape[0])

In [None]:
# ---- ADAS scores from the ADNI 1 file ----
adas_1 = pd.read_csv(PATH_TO_ADAS_1)
print('Initial shape (ADAS 1): %d' % adas_1.shape[0])

# rows need both an exam date and a subject identifier
adas_1 = adas_1.dropna(subset=['EXAMDATE', 'PTID'])
print('Shape after removing key variables (ADAS 1): %d' % adas_1.shape[0])

# index by (subject, date)
adas_1['EXAMDATE'] = pd.to_datetime(adas_1['EXAMDATE'])
adas_1 = adas_1.rename(columns={'PTID': 'subject_id', 'EXAMDATE': 'adas_date'})
adas_1 = adas_1.set_index(['subject_id', 'adas_date']).sort_index()

# keep the items of interest under their new names, discarding rows with no score at all
adas_1 = adas_1[list(adas_1_variables)]
adas_1 = adas_1.rename(columns=adas_1_variables, errors='raise').dropna(how='all')

print('Shape after removing missing in neuro vars: %d' % adas_1.shape[0])

# ---- ADAS scores from the ADNI GO/2/3 file ----
adas_go23 = pd.read_csv(PATH_TO_ADAS_GO23)
print('Initial shape (ADAS GO,2,3): %d' % adas_go23.shape[0])

# when DATE is missing fall back to VISDATE
date_missing = adas_go23['DATE'].isna()
adas_go23.loc[date_missing, 'DATE'] = adas_go23.loc[date_missing, 'VISDATE']

# rows need both a date and a subject identifier
adas_go23 = adas_go23.dropna(subset=['DATE', 'PTID'])
print('Shape after removing key variables (ADAS GO,2,3): %d' % adas_go23.shape[0])

# index by (subject, date)
adas_go23['DATE'] = pd.to_datetime(adas_go23['DATE'])
adas_go23 = adas_go23.rename(columns={'PTID': 'subject_id', 'DATE': 'adas_date'})
adas_go23 = adas_go23.set_index(['subject_id', 'adas_date']).sort_index()

# keep the items of interest under their new names, discarding rows with no score at all
adas_go23 = adas_go23[list(adas_go23_variables)]
adas_go23 = adas_go23.rename(columns=adas_go23_variables, errors='raise').dropna(how='all')

print('Shape after removing missing in neuro vars: %d' % adas_go23.shape[0])

# ---- stack both sources ----
all_adas = pd.concat([adas_1, adas_go23], axis=0)

# both sources must share exactly the same columns, otherwise the stack would add new ones
for source_table in (adas_1, adas_go23):
    assert all_adas.shape[1] == source_table.shape[1]

# a single entry per (subject, date): the last one in the stacked table
all_adas = all_adas.groupby(['subject_id', 'adas_date']).nth(-1)

assert not all_adas.index.duplicated().any()

print('Shape after merging all the information: %d' % all_adas.shape[0])


In [None]:
# Attach to every NEUROBAT entry the ADAS entry of the same subject closest in time;
# the ADAS scores are discarded when both dates are more than 60 days apart.
neurobat_adas_pairs = neurobat.join(all_adas, how='left')
neurobat_adas_pairs = neurobat_adas_pairs.reset_index(['neurobat_date', 'adas_date'])
neurobat_adas_pairs['_days_diff'] = (
    neurobat_adas_pairs['neurobat_date'] - neurobat_adas_pairs['adas_date']
).dt.days.abs()

# sort the candidates of each NEUROBAT entry by distance and keep the first (closest) one
neurobat_adas_pairs = neurobat_adas_pairs.reset_index()
neurobat_adas_pairs = neurobat_adas_pairs.set_index(['subject_id', 'neurobat_date', '_days_diff']).sort_index()
crossed_df = neurobat_adas_pairs.groupby(['subject_id', 'neurobat_date']).nth(0)
crossed_df = crossed_df.reset_index('_days_diff')

# NOTE(review): only the score columns are blanked; `adas_date` keeps the date of the
# discarded ADAS entry - confirm whether it should be cleared as well
adas_too_far = crossed_df['_days_diff'] > 60
crossed_df.loc[adas_too_far, all_adas.columns] = np.nan
crossed_df = crossed_df.drop(columns=['_days_diff'])

assert not crossed_df.index.duplicated().any()

print('Final dataframe shape: %d' % crossed_df.shape[0])
final_neuro = crossed_df.copy()

In [None]:
# Write the neuropsychological table to the output folder, prefixed with today's date (YYYYMMDD)
export_stamp = datetime.now().strftime('%Y%m%d')
neuropsycho_file = os.path.join(PATH_TO_OUTPUT_DATA, '%s_neuropsycho.parquet' % export_stamp)
final_neuro.to_parquet(neuropsycho_file)

# MMSE and CDR information

In [None]:
# MMSE scores
mmse = pd.read_csv(PATH_TO_MMSE)
print('Initial shape: %d' % mmse.shape[0])

# when the visit date is missing fall back to USERDATE
visit_date_missing = mmse['VISDATE'].isna()
mmse.loc[visit_date_missing, 'VISDATE'] = mmse.loc[visit_date_missing, 'USERDATE']

# rows need both a date and a subject identifier
mmse = mmse.dropna(subset=['VISDATE', 'PTID'])
print('Shape after removing key variables: %d' % mmse.shape[0])

# index by (subject, date) and keep only the total score
mmse['VISDATE'] = pd.to_datetime(mmse['VISDATE'])
mmse = mmse.rename(columns={'PTID': 'subject_id', 'VISDATE': 'mmse_date', 'MMSCORE': 'mmse'})
mmse = mmse.set_index(['subject_id', 'mmse_date']).sort_index()
mmse = mmse[['mmse']].dropna(how='all')

# a single entry per (subject, date): the last one after sorting
mmse = mmse.sort_index()
mmse = mmse.groupby(['subject_id', 'mmse_date']).nth(-1)

print('Shape after removing missing in neuro vars: %d' % mmse.shape[0])

In [None]:
# CDR scores (the CDRSB column is exported as `cdr`)
cdr = pd.read_csv(PATH_TO_CDR)
print('Initial shape: %d' % cdr.shape[0])

# rows need both a visit date and a subject identifier
cdr = cdr.dropna(subset=['VISDATE', 'PTID'])
print('Shape after removing key variables: %d' % cdr.shape[0])

# index by (subject, date) and keep only the score
cdr['VISDATE'] = pd.to_datetime(cdr['VISDATE'])
cdr = cdr.rename(columns={'PTID': 'subject_id', 'VISDATE': 'cdr_date', 'CDRSB': 'cdr'})
cdr = cdr.set_index(['subject_id', 'cdr_date']).sort_index()
cdr = cdr[['cdr']].dropna(how='all')

# a single entry per (subject, date): the last one after sorting
cdr = cdr.sort_index()
cdr = cdr.groupby(['subject_id', 'cdr_date']).nth(-1)

print('Shape after removing missing in neuro vars: %d' % cdr.shape[0])

In [None]:
# Attach to every MMSE entry the CDR entry of the same subject closest in time;
# the CDR score is discarded when both dates are more than 60 days apart.
mmse_cdr_pairs = mmse.join(cdr)
mmse_cdr_pairs = mmse_cdr_pairs.reset_index(['mmse_date', 'cdr_date'])
mmse_cdr_pairs['_days_diff'] = (
    mmse_cdr_pairs['mmse_date'] - mmse_cdr_pairs['cdr_date']
).dt.days.abs()

# sort the candidates of each MMSE entry by distance and keep the first (closest) one
mmse_cdr_pairs = mmse_cdr_pairs.reset_index()
mmse_cdr_pairs = mmse_cdr_pairs.set_index(['subject_id', 'mmse_date', '_days_diff']).sort_index()
mmse_cdr_df = mmse_cdr_pairs.groupby(['subject_id', 'mmse_date']).nth(0)
mmse_cdr_df = mmse_cdr_df.reset_index('_days_diff')

cdr_too_far = mmse_cdr_df['_days_diff'] > 60
mmse_cdr_df.loc[cdr_too_far, cdr.columns] = np.nan
mmse_cdr_df = mmse_cdr_df.drop(columns=['_days_diff'])

assert not mmse_cdr_df.index.duplicated().any()

print('Final dataframe shape: %d' % mmse_cdr_df.shape[0])

In [None]:
# Write the MMSE/CDR table to the output folder, prefixed with today's date (YYYYMMDD)
export_stamp = datetime.now().strftime('%Y%m%d')
mmse_cdr_file = os.path.join(PATH_TO_OUTPUT_DATA, '%s_mmse_cdr.parquet' % export_stamp)
mmse_cdr_df.to_parquet(mmse_cdr_file)

# Neuroimaging metadata

In [None]:
# Amyloid QC, first source: SCANQLTY indexed by (subject, scan date)
amy_qc = (
    pd.read_csv(PATH_TO_AMY_QC1)
    .assign(SCANDATE=lambda raw: pd.to_datetime(raw['SCANDATE']))
    .set_index(['PTID', 'SCANDATE'])[['SCANQLTY']]
    .rename_axis(['subject_id', 'amy_scan_date'])
    .rename(columns={'SCANQLTY': 'pass_amy_qc'})
)

assert not amy_qc.isna().any().any()

# Amyloid QC, second source (AV45): PASS indexed by (subject, exam date)
av45_qc = (
    pd.read_csv(PATH_TO_AMY_QC2)
    .assign(EXAMDATE=lambda raw: pd.to_datetime(raw['EXAMDATE']))
    .set_index(['PTID', 'EXAMDATE'])[['PASS']]
    .rename_axis(['subject_id', 'amy_scan_date'])
    .rename(columns={'PASS': 'pass_amy_qc'})
)

assert not av45_qc.isna().any().any()

# stack both sources and keep, for each (subject, date), the highest QC value
amy_qc = pd.concat([amy_qc, av45_qc], axis=0)
amy_qc = amy_qc.groupby(['subject_id', 'amy_scan_date']).max()

print(f'Number of entries in Amyloid QC: {amy_qc.shape[0]} (passed QC: {(amy_qc["pass_amy_qc"].mean() * 100):.1f})')

In [None]:
# Write the amyloid QC table to the output folder, prefixed with today's date (YYYYMMDD)
export_stamp = datetime.now().strftime('%Y%m%d')
amy_qc_file = os.path.join(PATH_TO_OUTPUT_DATA, '%s_amy_qc.parquet' % export_stamp)
amy_qc.to_parquet(amy_qc_file)

In [None]:
# FDG QC, first source: SCANQLTY indexed by (subject, scan date)
fdg_qc = (
    pd.read_csv(PATH_TO_FDG_QC1)
    .assign(SCANDATE=lambda raw: pd.to_datetime(raw['SCANDATE']))
    .set_index(['PTID', 'SCANDATE'])[['SCANQLTY']]
    .rename_axis(['subject_id', 'fdg_scan_date'])
    .rename(columns={'SCANQLTY': 'pass_fdg_qc'})
)

assert not fdg_qc.isna().any().any()

# FDG QC, second source: PASS indexed by (subject, exam date)
fdg_a4_qc = (
    pd.read_csv(PATH_TO_FDG_QC2)
    .assign(EXAMDATE=lambda raw: pd.to_datetime(raw['EXAMDATE']))
    .set_index(['PTID', 'EXAMDATE'])[['PASS']]
    .rename_axis(['subject_id', 'fdg_scan_date'])
    .rename(columns={'PASS': 'pass_fdg_qc'})
)

assert not fdg_a4_qc.isna().any().any()

# stack both sources and keep, for each (subject, date), the highest QC value
fdg_qc = pd.concat([fdg_qc, fdg_a4_qc], axis=0)
fdg_qc = fdg_qc.groupby(['subject_id', 'fdg_scan_date']).max()

print(f'Number of entries in fdg QC: {fdg_qc.shape[0]} (passed QC: {(fdg_qc["pass_fdg_qc"].mean() * 100):.1f})')

In [None]:
# Write the FDG QC table to the output folder, prefixed with today's date (YYYYMMDD)
export_stamp = datetime.now().strftime('%Y%m%d')
fdg_qc_file = os.path.join(PATH_TO_OUTPUT_DATA, '%s_fdg_qc.parquet' % export_stamp)
fdg_qc.to_parquet(fdg_qc_file)