Notebook used for the calculation of the composite scores.

In [None]:
import os 
import sys
from pathlib import Path
import importlib

PATH_TO_ROOT = Path(os.path.join('..'))

# add paths to internal libraries
sys.path.append(str(PATH_TO_ROOT / 'src'))

from utils.variables import GOJO_VERSION

# import gojo modules
gojo = importlib.import_module(GOJO_VERSION)

import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from datetime import datetime
from scipy.interpolate import PchipInterpolator
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, explained_variance_score

In [None]:
# Input tables generated by "Pipeline - Clinical data.ipynb"
PATH_TO_STUDY_DATA = PATH_TO_ROOT / 'data' / 'adni' / 'processed'
PATH_TO_NEUROPSYCHO_DATA = PATH_TO_STUDY_DATA / '20240428_neuropsycho.parquet'
PATH_TO_NEUROLOGY_DATA = PATH_TO_STUDY_DATA / '20240428_diagnosis.parquet'
PATH_TO_MMSE_CDR_DATA = PATH_TO_STUDY_DATA / '20240428_mmse_cdr.parquet'
PATH_TO_AMY_STATUS_DATA = PATH_TO_STUDY_DATA / '..' / 'download' / 'UCBERKELEY_AMY_6MM_24Apr2024.csv'

# Composite structure: cognitive domain -> tests that load on it.
# Entries kept as comments are tests that are currently excluded.
SEM_STRUCTURE = dict(
    memory=[
        'memory_avlt_trial_1',
        'memory_avlt_trial_2',
        # 'memory_avlt_trial_3',
        # 'memory_avlt_trial_4',
        # 'memory_avlt_trial_5',
        'memory_avlt_trial_6',
        'memory_avlt_delayed',
        'memory_avlt_recognition',
        'memory_word_recognition',
        'memory_word_recall_delayed',
        'memory_word_recall',
        # 'memory_remembering_test',
    ],
    language=[
        'language_cat_fluency',
        'language_naming',
        'language_bnt_tot',
        'language_word_finding_diff',
    ],
    exec=[
        'exec_tmt_a_time',
        'exec_tmt_b_time',
        # 'attention_digit_span_forward_binary',
        # 'attention_digit_span_backward_binary',
        'attention_number_cancellation',
    ],
    visuospatial=[
        'visuos_clock_copy_tot_score',
        'visuos_clock_draw_tot_score',
        'visuos_constructional_praxis',
        'visuos_ideational_praxis',
    ],
)

# Settings of the imputation step:
#   'model'             -> estimator class
#   'model_params'      -> keyword arguments used to instantiate the estimator
#   'imputation_params' -> extra keyword arguments forwarded to IterativeImputer
IMPUATION_MODEL = {
    'model': RandomForestRegressor,
    'model_params': {
        'n_estimators': 400,
        'max_depth': 5,
        'min_samples_split': 50,
        'min_samples_leaf': 20,
        'max_features': 0.75,
        'max_samples': 0.75,
        'bootstrap': True,
        'n_jobs': 20,
        'random_state': 1997,
    },
    'imputation_params': {
        'max_iter': 50,
        'initial_strategy': 'median',
    },
}

# Variables that keep their decimal values after imputation
# (every other composite source variable is rounded to the nearest integer)
CONTINUOUS_VARIABLES = ['exec_tmt_a_time', 'exec_tmt_b_time']


In [None]:
def crossNearestInfo(
    target_df: pd.DataFrame,
    source_df: pd.DataFrame,
    how: str,
    window: int = None
) -> pd.DataFrame:
    """ Attach to every row of `target_df` the row of `source_df` (same subject) with the closest date.

    Both frames must be indexed by a MultiIndex whose first level is the subject identifier and
    whose second level is a date. The frames are joined on their index and, for each
    (subject, target date) pair, only the source record with the smallest absolute difference in
    days is kept.

    Parameters
    ----------
    target_df : pd.DataFrame
        Frame defining the rows of the output.
    source_df : pd.DataFrame
        Frame providing the information to attach.
    how : str
        Join type forwarded to `DataFrame.join`.
    window : int, optional
        When truthy, only pairs separated by strictly fewer than `window` days are kept. Pairs
        without a source date compare as False, so unmatched target rows are removed as well,
        even for `how='left'`.

    Returns
    -------
    pd.DataFrame
        Frame indexed by (subject, target date) without duplicated index entries. The source date
        is kept as a regular column.
    """
    subject_id = target_df.index.names[0]
    target_date = target_df.index.names[1]
    source_date = source_df.index.names[1]

    # join on the index and move both dates to regular columns
    merged = target_df.join(source_df, how=how)
    merged = merged.reset_index([target_date, source_date])

    # optional restriction to pairs lying inside the window
    if window:
        gap_in_days = (merged[target_date] - merged[source_date]).dt.days.abs()
        merged = merged.loc[gap_in_days < window].copy()

    # order the candidates of every (subject, target date) by distance and keep the closest one
    merged['_days_diff'] = (merged[target_date] - merged[source_date]).dt.days.abs()
    merged = (
        merged
        .reset_index()
        .set_index([subject_id, target_date, '_days_diff'])
        .sort_index()
    )
    nearest = merged.groupby([subject_id, target_date]).nth(0)
    nearest = nearest.reset_index('_days_diff', drop=True).copy()

    assert not nearest.index.duplicated().any()

    return nearest


def getValsToImpute(arr: np.ndarray):
    """ Flag the missing values that lie strictly inside the observed part of a trajectory.

    A position is flagged when it is NaN and there is at least one non-NaN value before it and
    at least one non-NaN value after it. Leading and trailing NaNs, as well as observed values,
    are never flagged.

    Parameters
    ----------
    arr : np.ndarray
        One-dimensional array of numeric values (NaN marks a missing value).

    Returns
    -------
    np.ndarray
        Boolean array with the same length as `arr` (always of boolean dtype, including for
        an empty input).
    """
    values = np.asarray(arr, dtype=float)
    missing = np.isnan(values)
    to_impute = np.zeros(len(values), dtype=bool)

    # positions of the observed values; nothing can be interpolated without them
    observed_idx = np.flatnonzero(~missing)
    if observed_idx.size:
        first, last = observed_idx[0], observed_idx[-1]
        # inside [first, last] every NaN is surrounded by observed values
        to_impute[first:last + 1] = missing[first:last + 1]

    return to_impute


def interpolateMidPoints(
        df: pd.DataFrame, 
        var: str, 
        unique_ids: np.ndarray, 
        display_plots: int = 0
):
    """ Fill the missing values of `var` that lie between observed values of the same subject.

    For every subject in `unique_ids` a PCHIP interpolator is fitted on the observed
    (`years_followup_neurobat`, `var`) points and evaluated at the time points flagged by
    `getValsToImpute`. Subjects without missing values, with fewer than 3 observed values, or
    without any value to interpolate are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose first index level identifies the subject. It must contain the columns `var`
        and 'years_followup_neurobat'. The input frame is not modified.
    var : str
        Column to interpolate.
    unique_ids : np.ndarray
        Subject identifiers to process.
    display_plots : int
        A diagnostic figure is drawn for an interpolated subject only when its position in
        `unique_ids` is below this number.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with the interpolated values written in `var`.
    """
    out = df.copy()

    for position, subject in enumerate(unique_ids):
        trajectory = out.loc[subject, [var, 'years_followup_neurobat']].values
        scores = trajectory[:, 0]
        times = trajectory[:, 1]
        observed = ~np.isnan(scores)

        # nothing is missing for this subject
        if observed.all():
            continue

        # fewer than 3 observed values: not enough information to interpolate
        if observed.sum() < 3:
            continue

        # only the gaps surrounded by observed values are filled
        fill_mask = getValsToImpute(scores)
        if not fill_mask.any():
            continue

        # shape-preserving (PCHIP) interpolation fitted on the observed points
        interpolator = PchipInterpolator(times[observed], scores[observed])
        scores[fill_mask] = interpolator(times[fill_mask])
        out.loc[subject, var] = scores

        if position < display_plots:
            fig, ax = plt.subplots(figsize=(5, 2.5))
            # complete trajectory (already including the interpolated values)
            plt.scatter(times, scores)
            # interpolated points, highlighted in red
            plt.scatter(
                times[fill_mask],
                interpolator(times[fill_mask]),
                color='red', s=75
            )
            plt.plot(times, scores)
            for side in ('top', 'right'):
                ax.spines[side].set_visible(False)
            ax.grid(alpha=0.3, color='black')
            ax.set_xlabel('Time')
            ax.set_ylabel('Score')
            ax.set_title(var)
            plt.show()

    return out


# Data processing

In [None]:
# make sure that every input table exists before reading anything
for _const_name, _const_path in (
    ('PATH_TO_NEUROPSYCHO_DATA', PATH_TO_NEUROPSYCHO_DATA),
    ('PATH_TO_NEUROLOGY_DATA', PATH_TO_NEUROLOGY_DATA),
    ('PATH_TO_MMSE_CDR_DATA', PATH_TO_MMSE_CDR_DATA),
):
    assert os.path.exists(_const_path), '%s not found' % _const_name

# read the data
neuro = pd.read_parquet(PATH_TO_NEUROPSYCHO_DATA)
diag = pd.read_parquet(PATH_TO_NEUROLOGY_DATA)
mmse_cdr = pd.read_parquet(PATH_TO_MMSE_CDR_DATA)

# attach, in this order, the closest diagnosis and the closest MMSE/CDR record
# found within a window of 90 days around each neuropsychological evaluation
for source_table in (diag, mmse_cdr):
    neuro = crossNearestInfo(target_df=neuro, source_df=source_table, how='left', window=30*3)

# follow-up time: cumulative sum, within each subject, of the days elapsed between
# consecutive evaluations (0 for the first one), expressed in years
neuro['_date'] = neuro.index.get_level_values('neurobat_date')
days_since_previous = neuro.groupby('subject_id')['_date'].diff().dt.days.fillna(0.0)
neuro['years_followup_neurobat'] = days_since_previous.groupby('subject_id').cumsum() / 365
neuro = neuro.drop(columns=['_date'])

# flat list with the variables used for the composites creation (domain order is preserved)
composite_source_vars = []
for domain_tests in SEM_STRUCTURE.values():
    composite_source_vars.extend(domain_tests)

In [None]:

# --- Memory tests
# the value -1 is treated as missing in the AVLT variables
for col in [
    'memory_avlt_recognition',
    'memory_avlt_trial_1',
    'memory_avlt_trial_2',
    'memory_avlt_trial_6',
    'memory_avlt_delayed',
]:
    neuro.loc[neuro[col] == -1, col] = np.nan

# any negative value is treated as missing in the word-list variables
for col in ['memory_word_recall', 'memory_word_recall_delayed', 'memory_word_recognition']:
    neuro.loc[neuro[col] < 0, col] = np.nan

# flip the sign of the word-list variables
for col in ['memory_word_recognition', 'memory_word_recall_delayed', 'memory_word_recall']:
    neuro[col] = -1*neuro[col]


# --- Language tests
# negative values are treated as missing
for col in [
    'language_naming',
    'language_cat_fluency',
    'language_word_finding_diff',
    'language_bnt_tot',
]:
    neuro.loc[neuro[col] < 0, col] = np.nan

# collapse the naming score into three ordinal levels (0, 1 and "more than 1" -> 2) ...
for level in (0, 1):
    neuro.loc[neuro['language_naming'] == float(level), 'language_naming'] = level
neuro.loc[neuro['language_naming'] > 1.0, 'language_naming'] = 2
# ... and reverse them (raw 0 -> 2, raw 1 -> 1, raw > 1 -> 0)
neuro['language_naming'] = (-1 * neuro['language_naming']) + 2
neuro['language_word_finding_diff'] = -1*neuro['language_word_finding_diff']


# --- Executive tests
# TMT values below 10 are treated as missing
for col in ['exec_tmt_a_time', 'exec_tmt_b_time']:
    neuro.loc[neuro[col] < 10, col] = np.nan
neuro.loc[neuro['attention_number_cancellation'] < 0, 'attention_number_cancellation'] = np.nan

# TMT times: natural logarithm first, sign flip afterwards
for col in ['exec_tmt_a_time', 'exec_tmt_b_time']:
    neuro[col] = np.log(neuro[col])
for col in ['exec_tmt_a_time', 'exec_tmt_b_time']:
    neuro[col] = -1*neuro[col]

# invert ADAS-COG number cancellation (x -> 5 - x)
neuro['attention_number_cancellation'] = (-1*neuro['attention_number_cancellation'] + 5)


# --- Visuospatial tests
# negative values are treated as missing
for col in [
    'visuos_constructional_praxis',
    'visuos_clock_copy_tot_score',
    'visuos_clock_draw_tot_score',
    'visuos_ideational_praxis',
]:
    neuro.loc[neuro[col] < 0, col] = np.nan

# invert praxis scores (x -> 5 - x)
for col in ['visuos_constructional_praxis', 'visuos_ideational_praxis']:
    neuro[col] = (-1 * neuro[col]) + 5

In [None]:
# One figure per cognitive domain: box plot of every test split by diagnosis (3 panels per row)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for domain, tests in SEM_STRUCTURE.items():

        # grid with enough rows to hold all the tests of the domain
        n_rows = (len(tests) + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=(14, 3.5 * n_rows))
        axes = axes.ravel()

        for var, ax in zip(tests, axes):
            sns.boxplot(
                data=neuro,
                y=var,
                hue='diagnosis',
                hue_order=['control', 'mci', 'dementia'],
                ax=ax,
                gap=0.25,
                palette='Blues'
            )
            # legend below the panel, in a single row
            sns.move_legend(
                ax, "lower center",
                bbox_to_anchor=(.5, -0.25), ncol=3, title=None, frameon=False,
            )
            for side in ('top', 'right'):
                ax.spines[side].set_visible(False)
            ax.grid(alpha=.3, color='black')
            ax.set_xlabel('')
            ax.set_ylabel('score')
            ax.set_title(var, size=14, pad=15)

        # remove the panels of the grid that were not used
        for unused_ax in axes[len(tests):]:
            fig.delaxes(unused_ax)

        plt.subplots_adjust(hspace=0.75, wspace=0.5)
        plt.show()
        print('\n')
        

In [None]:
# Range, mean and standard deviation of every composite source variable
# (only these four rows of the `describe()` summary are displayed)
neuro[composite_source_vars].describe().loc[[
    'min', 'max', 'mean', 'std'
]]


In [None]:
# Correlation matrix of the composite source variables
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    data=neuro[composite_source_vars].corr(),
    annot=True, fmt=".2f",
    vmin=0, vmax=1,
    cmap='Spectral_r'
)

# Draw a square around the diagonal block of each cognitive domain. The variables follow
# the order of SEM_STRUCTURE, so each block starts where the previous one ended.
block_start = 0
for domain, tests in SEM_STRUCTURE.items():
    block_size = len(tests)
    ax.add_patch(
        plt.Rectangle(
            (block_start, block_start), block_size, block_size,
            fill=False, edgecolor='black', lw=2
        )
    )
    block_start += block_size

plt.show()

# Missing value imputation

In [None]:
# keep a copy of the data before any imputation; it is used later to export the
# non-imputed observations for the composite model fitting
neuro_no_impt = neuro.copy()

# percentage of missing values per composite source variable, sorted from highest to lowest
# (the column is named 'perc_missing' from the start, but it holds counts until the division below)
(pd.DataFrame(
    neuro[composite_source_vars].isna().sum(), columns=['perc_missing']
).sort_values(by='perc_missing', ascending=False) / neuro.shape[0] * 100).round(2)

In [None]:
# Fill, variable by variable, the missing values lying inside each subject's trajectory
# using a monotone (PCHIP) interpolation. Only subjects with missing values are visited.
for var in composite_source_vars:
    rows_with_nan = neuro[var].isna()
    subject_ids = neuro.loc[rows_with_nan].index.get_level_values('subject_id').unique().values
    neuro = interpolateMidPoints(df=neuro, var=var, unique_ids=subject_ids, display_plots=3)

In [None]:
# percentage of missing values per composite source variable after the interpolation step,
# sorted from highest to lowest
(pd.DataFrame(
    neuro[composite_source_vars].isna().sum(), columns=['perc_missing']
).sort_values(by='perc_missing', ascending=False) / neuro.shape[0] * 100).round(2)

In [None]:
# remember which entries have no BNT value (they are imputed separately later on)
neuro_bnt_nans_index = neuro.index[neuro['language_bnt_tot'].isna().values]

# add the diagnosis as indicator columns (the first category is dropped)
diagnosis_dummies = pd.get_dummies(neuro['diagnosis'], drop_first=True).astype(int)
neuro = neuro.join(diagnosis_dummies)

np.random.seed(1997)

# variables seen by the imputer: composite sources plus demographic/diagnostic covariates
imputation_vars = composite_source_vars + ['age', 'yschooling', 'mci', 'dementia']

# create imputation instance; imputed values are bounded by the observed range of each variable
imputer = IterativeImputer(
    estimator=IMPUATION_MODEL['model'](**IMPUATION_MODEL['model_params']),
    min_value=neuro[imputation_vars].min(),
    max_value=neuro[imputation_vars].max(),
    random_state=1997,
    verbose=1,
    **IMPUATION_MODEL['imputation_params']
)

# perform imputation and write back only the composite source variables
neuro_imputed = pd.DataFrame(
    imputer.fit_transform(neuro[imputation_vars]),
    columns=imputation_vars
)
neuro[composite_source_vars] = neuro_imputed[composite_source_vars].values

In [None]:
# round to the nearest integer every variable that is not declared as continuous
discrete_vars = [var for var in composite_source_vars if var not in CONTINUOUS_VARIABLES]
for var in discrete_vars:
    neuro[var] = neuro[var].round(0).astype(int)

### Special case: BNT imputation

In [None]:
# discard the BNT values produced by the iterative imputer: the entries that had no BNT value
# before that step (stored in `neuro_bnt_nans_index`) are set back to NaN
neuro.loc[neuro_bnt_nans_index, 'language_bnt_tot'] = np.nan

In [None]:
# create the model used to impute the BNT: same estimator class and hyperparameters
# as the ones configured in IMPUATION_MODEL, wrapped with the gojo sklearn interface
model = gojo.interfaces.SklearnModelWrapper(
    model_class=IMPUATION_MODEL['model'],
    **IMPUATION_MODEL['model_params']
)
# NOTE(review): presumably overrides the `n_jobs` given in 'model_params' so that the estimator
# runs single-threaded inside the parallel cross-validation below - confirm against gojo
model.updateParameters(n_jobs=1)    # select only one worker
model

In [None]:
# training data of the BNT model: rows with an observed BNT value
bnt_observed = ~neuro['language_bnt_tot'].isna()

# predictors: every other composite source variable plus age, schooling and diagnosis indicators
bnt_predictors = [v for v in composite_source_vars if v != 'language_bnt_tot']
bnt_predictors = bnt_predictors + ['age', 'yschooling', 'mci', 'dementia']

X = neuro.loc[bnt_observed, bnt_predictors]
y = neuro.loc[bnt_observed, 'language_bnt_tot']

In [None]:
# evaluate the model via cross validation to obtain an estimate of the expected error
# during imputation
# The splitter receives the subject identifier of every row as `instance_id`
# NOTE(review): presumably this keeps all the rows of a subject in the same fold - confirm
# against the gojo documentation
cv_report = gojo.core.evalCrossVal(
    X=X,
    y=y,
    model=model,
    cv=gojo.util.splitter.InstanceLevelKFoldSplitter(
        n_splits=10, n_repeats=5, instance_id=X.index.get_level_values('subject_id').values),
    n_jobs=10,
    save_train_preds=True
)

In [None]:
# plot predictions vs true values of the cross-validation test folds
# (grey points: individual predictions; red line: linear fit drawn by seaborn)
sns.lmplot(
    data=cv_report.getTestPredictions(),
    y='pred_labels',
    x='true_labels',
    aspect=1.25,
    scatter_kws=dict(color='grey', s=20, alpha=0.2),
    line_kws=dict(color='red', lw=2),
    height=3
)
plt.show()

In [None]:
# display performance on the test set: regression metrics averaged over the folds
# (note that the per-fold values are rounded to 3 decimals before the mean is computed)
cv_report.getScores(
    gojo.core.getDefaultMetrics('regression'), supress_warnings=True)['test'].drop(columns=['n_fold']).round(decimals=3).mean()

In [None]:
# display performance on the training set: regression metrics averaged over the folds
# (note that the per-fold values are rounded to 3 decimals before the mean is computed)
cv_report.getScores(
    gojo.core.getDefaultMetrics('regression'), supress_warnings=True)['train'].drop(columns=['n_fold']).round(decimals=3).mean()

In [None]:
# fit the model on all the rows with an observed BNT value
model.train(X, y)

# rows to predict (no BNT value) and predictors expected by the model
bnt_missing = neuro['language_bnt_tot'].isna()
bnt_predictors = [v for v in composite_source_vars if v != 'language_bnt_tot']
bnt_predictors = bnt_predictors + ['age', 'yschooling', 'mci', 'dementia']
pred_bnt_scores = model.performInference(neuro.loc[bnt_missing, bnt_predictors])

# keep the predictions in a separate column (NaN wherever the BNT was observed)
neuro['language_bnt_tot_rf_pred'] = np.nan
neuro.loc[bnt_missing, 'language_bnt_tot_rf_pred'] = pred_bnt_scores

In [None]:
# fit a LMM model to the longitudinal data
# (only the rows with an observed BNT value are used)
lmm_input = neuro.loc[~neuro['language_bnt_tot'].isna()][[
    'language_bnt_tot',
    'age',
    'yschooling',
    'dementia',
    'mci',
    'years_followup_neurobat',
]].copy()
# subtracting the follow-up time leaves the age at the first evaluation of each subject
lmm_input['age'] = lmm_input['age'] -  lmm_input['years_followup_neurobat']   # remove time effect from the age

# fit the LMM model:
#   fixed effects  -> follow-up time, diagnosis indicators, their interaction with time,
#                     age and years of schooling
#   random effects -> intercept and follow-up time slope per subject
lmm_model = smf.mixedlm(
    """ language_bnt_tot ~ 1 + years_followup_neurobat +
        years_followup_neurobat*dementia +
        years_followup_neurobat*mci +
        age + 
        yschooling
    """, 
    re_formula="~ 1 + years_followup_neurobat",
    data=lmm_input.reset_index(), 
    groups=lmm_input.reset_index()["subject_id"],
    missing='drop'
)
lmm_result = lmm_model.fit(method=["lbfgs"])

# extract the model coefficients (used afterwards to extrapolate the BNT trajectories)
model_coefs = lmm_result.params

In [None]:
# R^2 between the observed BNT values and the values fitted by the LMM, followed by the model summary
# NOTE(review): the model is created with missing='drop'; if any row of `lmm_input` is dropped,
# `fittedvalues` may no longer have the same length as `lmm_input` - confirm that no covariate is missing
print('R^2: {:.3f}'.format(r2_score(lmm_input['language_bnt_tot'], lmm_result.fittedvalues)))
lmm_result.summary()


In [None]:
# Impute the BNT trajectory of every subject with at least one missing BNT value.
#
# Strategy (per subject):
#   1. if the first evaluation has no BNT value, it is seeded with the random forest prediction;
#   2. from the first remaining missing value onwards, the trajectory is extrapolated from the
#      value that precedes it, adding the change expected from the LMM fixed effects;
#   3. once all subjects are processed, the values are clipped to [0, 30] and every observed
#      BNT value is restored.
sub_ids = neuro.loc[neuro['language_bnt_tot'].isna()].index.get_level_values('subject_id').unique()

neuro['language_bnt_tot_imputed'] = np.nan
max_display = 10
display_count = 0
for subid in sub_ids:
    # explicit copies: the arrays are modified below and must not alias the data frame
    observed_vals = neuro.loc[subid, 'language_bnt_tot'].to_numpy(dtype=float, copy=True)
    pred_vals = neuro.loc[subid, 'language_bnt_tot_rf_pred'].to_numpy(dtype=float, copy=True)
    years = neuro.loc[subid, 'years_followup_neurobat'].to_numpy(dtype=float)
    dementia = neuro.loc[subid, 'dementia'].to_numpy(dtype=float)
    mci = neuro.loc[subid, 'mci'].to_numpy(dtype=float)
    true_vals = observed_vals.copy()

    # impute the first value using the predicted value of the model whenever it is missing.
    # Seeding it only when *all* the values are missing is not enough: with a missing first
    # value followed by observed ones, the extrapolation below would start at index 0 and
    # `first_nan_idx - 1` would wrap around to the last evaluation of the subject.
    if np.isnan(true_vals[0]):
        true_vals[0] = pred_vals[0]

    # impute the values using the expected decay based on the LMM parameters
    still_missing = np.isnan(true_vals)
    if still_missing.any():
        first_nan_idx = np.where(still_missing)[0][0]   # always >= 1 after the seeding above
        tail = slice(first_nan_idx, None)
        true_vals[tail] = true_vals[first_nan_idx - 1] + (
            model_coefs.loc['years_followup_neurobat'] * years[tail] +
            model_coefs.loc['dementia'] * dementia[tail] +
            model_coefs.loc['mci'] * mci[tail] +
            model_coefs.loc['years_followup_neurobat:dementia'] * dementia[tail] * years[tail] +
            model_coefs.loc['years_followup_neurobat:mci'] * mci[tail] * years[tail]
        )

    # save the imputed values
    neuro.loc[subid, 'language_bnt_tot_imputed'] = true_vals

    # show the trajectory of (at most) `max_display` randomly picked subjects
    if display_count < max_display and np.random.uniform() > 0.5:
        display_count += 1
        fig, ax = plt.subplots(figsize=(4, 2))

        # imputed trajectory (orange) drawn below the observed values (blue)
        ax.plot(years, true_vals, color='orange')
        ax.scatter(years, true_vals, color='orange')
        ax.plot(years, observed_vals, color='blue')
        ax.scatter(years, observed_vals, color='blue')
        ax.grid(alpha=0.3, color='black')
        for pos in ['left', 'right', 'bottom', 'top']:
            ax.spines[pos].set_visible(False)
        ax.set_xlabel('Years')
        ax.set_ylabel('BNT')
        ax.set_ylim(-1, 31)
        plt.show()

# adjust imputed values range
neuro.loc[neuro['language_bnt_tot_imputed'] > 30, 'language_bnt_tot_imputed'] = 30
neuro.loc[neuro['language_bnt_tot_imputed'] < 0, 'language_bnt_tot_imputed'] = 0

# observed values always take precedence over the extrapolated ones
bnt_observed = ~neuro['language_bnt_tot'].isna()
neuro.loc[bnt_observed, 'language_bnt_tot_imputed'] = neuro.loc[bnt_observed, 'language_bnt_tot']

In [None]:
# display model predictions vs imputed values 
# (the correlation shown in the title is computed only on the rows without an observed BNT value)
sns.lmplot(
    data=neuro,
    y='language_bnt_tot_rf_pred',
    x='language_bnt_tot_imputed',
    aspect=1.25,
    scatter_kws=dict(color='grey', s=20, alpha=0.2),
    line_kws=dict(color='red', lw=2),
    height=3
)
plt.title('Correlation {:.3f}'.format(
    neuro.loc[neuro['language_bnt_tot'].isna()][[
        'language_bnt_tot_imputed', 'language_bnt_tot_rf_pred']].corr().values[0, 1]))
plt.show()

In [None]:
# replace the BNT original version with the imputed one
# (the values before this imputation are kept in 'language_bnt_tot_original')
neuro['language_bnt_tot_original'] = neuro['language_bnt_tot']
neuro['language_bnt_tot'] = neuro['language_bnt_tot_imputed']

In [None]:
# percentage of missing values per composite source variable after all the imputation steps,
# sorted from highest to lowest
pd.DataFrame(
    neuro[composite_source_vars].isna().sum(), columns=['perc_missing']
).sort_values(by='perc_missing', ascending=False) / neuro.shape[0] * 100

## Data visualization

In [None]:
# Same box plots as before the imputation, now drawn on the imputed data:
# one figure per cognitive domain, one panel per test, split by diagnosis
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for domain, tests in SEM_STRUCTURE.items():

        # grid of 3 columns with as many rows as needed
        n_rows = (len(tests) + 2) // 3
        fig, axes = plt.subplots(n_rows, 3, figsize=(14, 3.5 * n_rows))
        axes = axes.ravel()

        for var, ax in zip(tests, axes):
            sns.boxplot(
                data=neuro,
                y=var,
                hue='diagnosis',
                hue_order=['control', 'mci', 'dementia'],
                ax=ax,
                gap=0.25,
                palette='Blues'
            )
            # legend below the panel, in a single row
            sns.move_legend(
                ax, "lower center",
                bbox_to_anchor=(.5, -0.25), ncol=3, title=None, frameon=False,
            )
            for side in ('top', 'right'):
                ax.spines[side].set_visible(False)
            ax.grid(alpha=.3, color='black')
            ax.set_xlabel('')
            ax.set_ylabel('score')
            ax.set_title(var, size=14, pad=15)

        # remove the panels of the grid that were not used
        for unused_ax in axes[len(tests):]:
            fig.delaxes(unused_ax)

        plt.subplots_adjust(hspace=0.75, wspace=0.5)
        plt.show()
        print('\n')

In [None]:
# Correlation matrix of the composite source variables after the imputation
fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    data=neuro[composite_source_vars].corr(),
    annot=True, fmt=".2f",
    vmin=0, vmax=1,
    cmap='Spectral_r'
)

# Draw a square around the diagonal block of each cognitive domain. The variables follow
# the order of SEM_STRUCTURE, so each block starts where the previous one ended.
block_start = 0
for domain, tests in SEM_STRUCTURE.items():
    block_size = len(tests)
    ax.add_patch(
        plt.Rectangle(
            (block_start, block_start), block_size, block_size,
            fill=False, edgecolor='black', lw=2
        )
    )
    block_start += block_size

plt.show()

In [None]:
# Last complete (non-imputed) observation of every subject: used to fit the composite model in R
original_neuro = neuro_no_impt[composite_source_vars].dropna().sort_index().groupby('subject_id').nth(-1).copy()

# fix variables with a low number of levels: constructional praxis values below 1.5 are
# merged into the level 2.0, both in the non-imputed and in the imputed data
for frame in (original_neuro, neuro):
    frame.loc[frame['visuos_constructional_praxis'] < 1.5, 'visuos_constructional_praxis'] = 2.0

# export the generated data to calculate the composites using R. Use the data without imputation
# for model fitting. The date prefix is computed once so that both files always share it
# (two independent `datetime.now()` calls could differ if the cell runs across midnight).
export_stamp = datetime.now().strftime('%Y%m%d')

original_neuro.to_parquet(
    PATH_TO_STUDY_DATA / ('%s_neuropsycho_inputSEM_last.parquet' % export_stamp)
)

neuro[composite_source_vars].sort_index().to_parquet(
    PATH_TO_STUDY_DATA / ('%s_neuropsycho_inputSEM_allObs.parquet' % export_stamp)
)

# Composites calculation

## Data processing

In [None]:
# load the composites calculated outside this notebook
composites = pd.read_parquet(PATH_TO_STUDY_DATA / '20240428_neuropsycho_lavaan_SEM.parquet')

# rebuild the (subject_id, neurobat_date) index; only the first 11 characters of the
# date representation are parsed
composites['neurobat_date'] = pd.to_datetime(
    composites['neurobat_date'].apply(lambda raw_date: str(raw_date)[:11])
)
composites = composites.set_index(['subject_id', 'neurobat_date']).sort_index()

# every remaining column is a composite: tag it with the '_composite' suffix
composites.columns = ['%s_composite' % name for name in composites.columns]

# add the composites to the neuropsychological data (observations without composites are dropped)
neuro = neuro.join(composites, how='inner')

### Normative data adjustment

In [None]:
# -- adjust the values by age and education level using normative information (controls that maintain their diagnosis)
# select normative subjects (baseline diagnosis control, last diagnosis control, and at least 2 years of follow-up)
# with negative amyloid status

# select subjects with negative amyloid status based on:
# https://doi.org/10.1016/j.jalz.2012.10.007
uc_amy_status = pd.read_csv(PATH_TO_AMY_STATUS_DATA, low_memory=False)
uc_amy_status = uc_amy_status[['PTID', 'SCANDATE', 'AMYLOID_STATUS_COMPOSITE_REF', 'SUMMARY_SUVR']].copy()
uc_amy_status['SCANDATE'] = pd.to_datetime(uc_amy_status['SCANDATE'])
uc_amy_status = uc_amy_status.rename(columns={'PTID': 'subject_id', 'SCANDATE': 'date'}).set_index(['subject_id', 'date'])
# the amyloid status of a subject is the one of its last scan
uc_amy_status_last = uc_amy_status.sort_index().groupby('subject_id').nth(-1)
# a status below 0.5 in the last scan is considered negative
amy_neg_subject_ids = uc_amy_status_last\
    .loc[uc_amy_status_last['AMYLOID_STATUS_COMPOSITE_REF'] < 0.5]\
    .index.get_level_values('subject_id').unique()
print(f'Number of subjects with a negative amyloid status: {len(amy_neg_subject_ids)}')

# select baseline and last information (one row per subject, columns prefixed with 'base_' / 'last_')
base_neuro = neuro.sort_index().groupby('subject_id').nth(0).reset_index('neurobat_date')
last_neuro = neuro.sort_index().groupby('subject_id').nth(-1).reset_index('neurobat_date')
base_neuro.columns = ['base_%s' % c for c in base_neuro.columns]
last_neuro.columns = ['last_%s' % c for c in last_neuro.columns]
base_last_neuro = base_neuro.join(last_neuro, how='inner')

# remove subjects with less than 2 years of follow-up
# (measured between the diagnosis dates attached to the baseline and to the last evaluation)
base_last_neuro = base_last_neuro.loc[
    (base_last_neuro['last_diag_date'] - base_last_neuro['base_diag_date']).dt.days.abs() >= 365*2
].copy()

print('Number of subjects with al least 2 years of follow-up: %d' % len(base_last_neuro))

# select subjects that maintain a healthy diagnosis with a CDR == 0 and
# select last evaluation and mmse >= 26 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6949533/)
# the subjects must also have a negative amyloid status
normative_subjects = base_last_neuro.loc[
    base_last_neuro['base_diagnosis'].isin(['control']) &
    base_last_neuro['last_diagnosis'].isin(['control']) &
    base_last_neuro['base_cdr'].isin([0.0]) &
    base_last_neuro['last_cdr'].isin([0.0]) &
    (base_last_neuro['last_mmse'] >= 26) & 
    base_last_neuro.index.isin(amy_neg_subject_ids)
].copy()

print('Number of control subjects used to adjust cognitive neuro: %d' % len(normative_subjects))

# save information about amyloid status
# NOTE(review): the flag is set to 1 for every evaluation of the *normative* subjects (all the
# criteria above, not only the amyloid status) and left as NaN otherwise - confirm that the
# column name reflects the intended content
neuro['uc_berkley_negative_amyloid_1Y'] = np.nan
neuro.loc[
    neuro.index.get_level_values('subject_id').isin(normative_subjects.index.unique()), 
    'uc_berkley_negative_amyloid_1Y'] = 1

In [None]:
# center independent variables (age and years of schooling at the last evaluation of the
# normative subjects) and add the intercept column
Xvar = normative_subjects[['last_age', 'last_yschooling']].copy()
Xvar_mean = Xvar.mean()
Xvar = Xvar - Xvar_mean
Xvar = sm.add_constant(Xvar)

# adjust variables
# NOTE: the composites are modified in place (`-=`), so running this cell more than once
# applies the correction again on already-adjusted values
for composite in composites.columns:
    print('\n =============== Composite: "%s"\n' % composite)
    
    base_composite = 'base_%s' % composite
    last_composite = 'last_%s' % composite
    
    # fit the GLM model (Gaussian family) on the last evaluation of the normative subjects
    glm_out = sm.GLM(
        normative_subjects[last_composite], Xvar,
        family=sm.families.Gaussian()
    ).fit()
    
    # display summary
    print(glm_out.summary())
    print('----- Initial correlation\n')
    display(
        normative_subjects[[last_composite, 'last_age', 'last_yschooling']].corr().round(decimals=3)
    )
    
    # plot initial composite distribution
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.hist(neuro[composite].values, label='Raw', alpha=0.3)
    
    # make the adjustment: subtract the age and schooling effects estimated on the normative
    # subjects, with both covariates centered on the normative means (the means and the
    # coefficients are matched with the columns by position)
    neuro[composite] -= (
        (neuro[['age', 'yschooling']] - Xvar_mean.values) * 
        glm_out.params[['last_age', 'last_yschooling']].values
    ).sum(axis=1)
    
    # plot the adjusted distribution
    ax.hist(neuro[composite].values, label='Adj', alpha=0.3)
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(alpha=0.3)
    ax.set_xlabel('Value')
    ax.set_ylabel('Freq')
    ax.set_title(composite)
    plt.show()
    
    # correlation after the adjustment, on the last evaluation of the normative subjects
    print('----- Final correlation\n')
    last_normative_neuro_adjusted = neuro.loc[
        normative_subjects.index.get_level_values('subject_id')
    ].copy().groupby('subject_id').nth(-1)
    display(
        last_normative_neuro_adjusted[[composite, 'age', 'yschooling']].corr().round(decimals=3)
    )
    print('\n')

In [None]:
# Rescale the composites to an anchored scale (in the spirit of the centiloid scale):
#   0   -> median of the observations with a mild dementia diagnosis
#   100 -> median of the observations of the normative subjects

# observations with a dementia diagnosis in mild stage
dementia_mask = (neuro['diagnosis'] == 'dementia') & (neuro['dementia_stage'] == 'mild')

for composite in composites.columns:
    # anchors of the scale
    mild_dementia_median = neuro.loc[dementia_mask][composite].quantile(0.5)
    normative_median = neuro.loc[normative_subjects.index.unique()][composite].quantile(0.5)

    # linear rescaling between both anchors
    anchor_range = normative_median - mild_dementia_median
    neuro[composite] = ((neuro[composite] - mild_dementia_median) / anchor_range) * 100

## Data exploration

In [None]:
# display the correlation between the composites
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    data=neuro[composites.columns.tolist()].corr(),
    annot=True, fmt=".2f",
    vmin=0, vmax=1,
    cmap='Blues'
)
plt.show()

In [None]:
# Correlation matrix of the composite source variables together with the composites
fig, ax = plt.subplots(figsize=(16, 11))
sns.heatmap(
    data=neuro[composite_source_vars + composites.columns.tolist()].corr(),
    annot=True, fmt=".2f",
    vmin=0, vmax=1,
    cmap='Spectral_r'
)

# Draw a square around the diagonal block of each cognitive domain (source variables only).
# The variables follow the order of SEM_STRUCTURE, so each block starts where the previous one ended.
block_start = 0
for domain, tests in SEM_STRUCTURE.items():
    block_size = len(tests)
    ax.add_patch(
        plt.Rectangle(
            (block_start, block_start), block_size, block_size,
            fill=False, edgecolor='black', lw=2
        )
    )
    block_start += block_size
plt.show()

In [None]:
# Variance of each test explained by the composite of its own domain
# (simple linear regression of the test on the composite)
ev_records = []
for domain, tests in SEM_STRUCTURE.items():
    for test in tests:
        composite_values = neuro[f'{domain}_composite'].values[:, np.newaxis]
        test_values = neuro[test].values
        regression = LinearRegression(fit_intercept=True).fit(composite_values, test_values)
        ev_records.append({
            'domain': domain,
            'test': test,
            'ev': explained_variance_score(test_values, regression.predict(composite_values)),
        })
composite_ev = pd.DataFrame(ev_records)

composite_ev.set_index(['domain', 'test']).sort_index().round(2)

In [None]:
# Variance of each test explained by the composites of the domains it does NOT belong to
# (simple linear regression of the test on the composite)
foreign_ev_records = []
for domain, tests in SEM_STRUCTURE.items():
    foreign_tests = [test for test in composite_source_vars if test not in tests]
    for test in foreign_tests:
        composite_values = neuro[f'{domain}_composite'].values[:, np.newaxis]
        test_values = neuro[test].values
        regression = LinearRegression(fit_intercept=True).fit(composite_values, test_values)
        foreign_ev_records.append({
            'domain': domain,
            'test': test,
            'ev': explained_variance_score(test_values, regression.predict(composite_values)),
        })

composite_no_belong_ev = pd.DataFrame(foreign_ev_records)

composite_no_belong_ev.set_index(['domain', 'test']).sort_index().round(2)

In [None]:
# Distribution of every composite by diagnosis (density normalised within each group)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # grid of 2 columns with as many rows as needed
    n_rows = (len(composites.columns) + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(11, 3.5 * n_rows))

    for panel, composite in zip(axes.ravel(), composites.columns):
        sns.histplot(
            data=neuro,
            x=composite,
            hue='diagnosis',
            # kde=True,
            hue_order=['control', 'mci', 'dementia'],
            palette=['#27AE60', '#F5B041', '#C0392B'],
            stat='density',
            common_norm=False,
            ax=panel
        )
        for side in ('top', 'right'):
            panel.spines[side].set_visible(False)
        panel.set_title(composite.replace('_composite', ''), size=15)
        panel.grid(alpha=0.2, color='black')
        panel.set_xlabel('Value')

    plt.subplots_adjust(hspace=0.5, wspace=0.25)
    plt.show()

In [None]:
# Limit the composites to the range [-200, 200] (values outside are set to the closest bound)
# and display the resulting distributions by diagnosis
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # grid of 2 columns with as many rows as needed
    n_rows = (len(composites.columns) + 1) // 2
    fig, axes = plt.subplots(n_rows, 2, figsize=(11, 3.5 * n_rows))

    for panel, composite in zip(axes.ravel(), composites.columns):
        # NOTE: this modifies `neuro` in place
        too_low = neuro[composite] < -200
        neuro.loc[too_low, composite] = -200
        too_high = neuro[composite] > 200
        neuro.loc[too_high, composite] = 200

        sns.histplot(
            data=neuro,
            x=composite,
            hue='diagnosis',
            # kde=True,
            hue_order=['control', 'mci', 'dementia'],
            palette=['#27AE60', '#F5B041', '#C0392B'],
            stat='density',
            common_norm=False,
            ax=panel
        )
        for side in ('top', 'right'):
            panel.spines[side].set_visible(False)
        panel.set_title(composite.replace('_composite', ''), size=15)
        panel.grid(alpha=0.2, color='black')
        panel.set_xlabel('Value')

    plt.subplots_adjust(hspace=0.5, wspace=0.25)
    plt.show()

In [None]:
# export the generated data (imputed tests plus adjusted, scaled and clipped composites)
# the file name is prefixed with the current date in YYYYMMDD format
neuro.to_parquet(
    PATH_TO_STUDY_DATA / ('%s_neuropsycho_imputed_composites_v1.0.parquet' % datetime.now().strftime('%Y%m%d'))
)