Notebook used to build the final databases used for modeling.

In [None]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.interpolate import PchipInterpolator
from IPython.display import display
from tqdm import tqdm
from pprint import pprint
from datetime import datetime
from copy import deepcopy

In [None]:
# root directory of the study data (neuropsychological and neuroimaging files hang from here)
BASE_STUDY_PATH = Path('..', 'data')

# parquet file with the pre-computed neuropsychological composites
PATH_TO_COMPOSITES = BASE_STUDY_PATH.joinpath(
    'adni', 'processed', '20240428_neuropsycho_imputed_composites_v1.0.parquet')

# per-modality parquet files: the extracted image features ('data') and the outliers detected on
# them ('outliers'). The key order (mri, fdg, amy) is the order in which modalities are processed.
_NEUROIMAGING_DIR = BASE_STUDY_PATH.joinpath('neuroimaging', 'v1')
PATH_TO_IMAGE_FEATS = dict(
    mri=dict(
        data=_NEUROIMAGING_DIR / '20240428_MRI_intermodality_v0.parquet',
        outliers=_NEUROIMAGING_DIR / '20240428_MRI_intermodality_v0_outliers_generated20240428.parquet',
    ),
    fdg=dict(
        data=_NEUROIMAGING_DIR / '20240428_FDG_intermodality_v0.parquet',
        outliers=_NEUROIMAGING_DIR / '20240428_FDG_intermodality_v0_outliers_generated20240428.parquet',
    ),
    amy=dict(
        data=_NEUROIMAGING_DIR / '20240428_AMY_intermodality_v0.parquet',
        outliers=_NEUROIMAGING_DIR / '20240428_AMY_intermodality_v0_outliers_generated20240428.parquet',
    ),
)

# Two images of the same subject must be at least this many years apart to be treated as the
# starting point of two different trajectories
NEUROIMAGING_ACQ_OFFSET = 2

# allowed number of follow-ups (both limits are inclusive)
MIN_NUMBER_FOLLOWUPS = 2
MAX_NUMBER_FOLLOWUPS = 8

# allowed follow-up duration in years (both limits are inclusive)
MIN_YEARS_FOLOWUP = 2
MAX_YEARS_FOLLOWUP = 4

# Quantile of the healthy-control distribution used as the maximum value that a neuropsychological
# composite can take. This is intended to reduce the variability in the subject-level trajectories
PERCENTILE_FOR_HC_CLIPPING = 0.5

# maximum distance (in days) allowed between acquisitions of different imaging modalities
CROSS_NEUROIMAGING_WINDOW = 30.5 * 3  # +/- 3 months

# maximum distance (in days) allowed between neuroimaging and neuropsychological data
CROSS_NEUROPSYCHO_WINDOW = 30.5 * 3  # +/- 3 months

# composite scores for which the trajectories are calculated
TRAJECTORY_VARIABLES = [
    'memory_composite',
    'exec_composite',
    'language_composite',
    'visuospatial_composite',
]

In [None]:
def interpolateTimeSeries(
    t: np.ndarray,
    y: np.ndarray,
    step: float,
    max_t: float
) -> tuple:
    """ Resample a time series on a regular grid using PCHIP interpolation.

    The grid starts at 0 and advances by `step` up to (but excluding) `max_t`. The grid is then cut
    right after the grid point closest to the last observed time `t[-1]`, so values are not
    extrapolated much beyond the observed follow-up.

    Parameters
    ----------
    t : observed time points (must be accepted by `PchipInterpolator`).
    y : observed values, one per time point.
    step : spacing of the regular grid.
    max_t : upper limit (exclusive) of the regular grid.

    Returns
    -------
    tuple `(grid_t, grid_y)` with the retained grid points and the interpolated values.
    """
    interpolator = PchipInterpolator(t, y)

    # evaluate the interpolator on the whole regular grid
    grid_t = np.arange(0, max_t, step)
    grid_y = interpolator(grid_t)

    # position of the grid point closest to the last observation (subtracting the constant `step`
    # does not change which position holds the minimum); everything after it is discarded
    nearest_to_last = np.argmin(np.abs(grid_t - t[-1]) - step)
    n_keep = nearest_to_last + 1

    return grid_t[:n_keep], grid_y[:n_keep]


def setLayout(ax):
    """ Apply the common axes style: a faint black grid and no visible spines.

    The axes object is modified in place and also returned so that calls can be chained.
    """
    ax.grid(alpha=0.3, color='black')
    for side in ('top', 'bottom', 'right', 'left'):
        ax.spines[side].set_visible(False)

    return ax


def addYears(date, years):
    """ Shift `date` by a possibly fractional number of years.

    The integer part of `years` is added as calendar years and the fractional part is converted to
    whole calendar months (truncated), e.g. 1.5 -> 1 year and 6 months.
    """
    whole_years = int(years)
    extra_months = int((years % 1) * 12)
    return date + pd.DateOffset(years=whole_years, months=extra_months)


def adjustTrajectory(x: np.ndarray, y: np.ndarray, min_change: float, max_change: float) -> np.ndarray:
    """ Subroutine used to stabilize neuropsychology score trajectories by setting a
    maximum and minimum allowable change.

    Parameters
    ----------
    x : time stamps of the evaluations (same length as `y`).
    y : scores at each evaluation. The input array is never modified.
    min_change : lowest allowed change per unit of `x` between consecutive evaluations.
    max_change : highest allowed change per unit of `x` between consecutive evaluations.

    Returns
    -------
    np.ndarray (float) with the adjusted scores. The first value is always kept as the anchor of
    the trajectory (after the artifact correction described below).

    Raises
    ------
    ValueError if `x` or `y` contain NaN or infinite values (the iterative adjustment can never
    become stable on such input).
    """
    x = np.asarray(x, dtype=float)
    # work on a float copy: protects the caller's array and avoids truncating the averaged value
    # assigned below when the scores are provided as integers
    y = np.array(y, dtype=float)

    if not (np.isfinite(x).all() and np.isfinite(y).all()):
        raise ValueError('adjustTrajectory requires finite time stamps and scores.')

    # If an evaluation is more than 50 points below the minimum of the next two evaluations, we assume
    # that the evaluation was incorrect and assign the average value of the next two evaluations. This
    # is intended to reduce the effects of "fast retrievers" due to artifacts.
    # This logic is applied to all the evaluations that are followed by at least two more.
    if len(y) >= 3:
        for i in range(len(y) - 2):
            next_two = y[i+1:i+3]
            if (y[i] - next_two.min()) < -50:
                y[i] = next_two.mean()

    # nothing to clip with fewer than two evaluations
    if len(y) < 2:
        return y

    # then clip allowable changes
    dx = np.diff(x)

    # Each pass rebuilds every point from the *previous* point of the previous pass, so point i is
    # settled after i passes. len(y) + 1 passes are therefore always enough, and the bound guarantees
    # termination.
    for _ in range(len(y) + 1):
        # rate of change between consecutive evaluations
        with np.errstate(divide='ignore', invalid='ignore'):
            rate = np.diff(y) / dx
        # repeated time stamps contribute no change (rate * 0), any finite rate is valid there
        rate = np.where(dx == 0, 0.0, rate)

        # adjust the changes
        rate = np.clip(rate, min_change, max_change)

        # apply the inverse transform, keeping the baseline value fixed
        y_adj = np.concatenate(([y[0]], y[:-1] + rate * dx))

        # the trajectory is stable
        stable = np.sum(np.abs(y_adj - y)) < 0.1

        # update y variable
        y = y_adj

        if stable:
            break

    return y


def smoothTrajectory(t: np.ndarray, y: np.ndarray, int_freq: float) -> tuple:
    """ Smooth the input trajectory by applying a moving linear regression model.

    The series is first interpolated on a regular grid (step `int_freq`). Depending on the number of
    interpolated points:
      - 6 or more: moving linear regression with a window of 4 points;
      - exactly 5: moving linear regression with a window of 3 points;
      - 3 or 4: the interpolated series is returned without smoothing;
      - fewer than 3: the original `(t, y)` is returned.

    Returns
    -------
    tuple `(times, values)` of the smoothed (or untouched) trajectory.
    """
    def _getLRAdjTraj(_x: np.ndarray, _y: np.ndarray, window: int) -> np.ndarray:
        # one row per window position; every window only fills the columns (time points) it covers
        n_windows = len(_x) - window + 1
        stacked_preds = np.full((n_windows, window + n_windows - 1), np.nan)

        for start in range(n_windows):
            stop = start + window
            features = _x[start:stop, np.newaxis]

            # fit a linear regression model on the window and keep its fitted values
            model = LinearRegression(fit_intercept=True).fit(features, _y[start:stop])
            stacked_preds[start, start:stop] = model.predict(features)

        # average, per time point, the predictions of all the windows covering it
        return np.nanmean(stacked_preds, axis=0)

    # interpolate the time series
    int_t, int_y = interpolateTimeSeries(t, y, step=int_freq, max_t=np.max(t)+0.05)
    n_points = len(int_t)

    if n_points < 3:
        # too short: return the original values
        return t, y
    if n_points < 5:
        # not enough points for a moving window: return the interpolated values
        return int_t, int_y

    # window of size 4 with at least 6 time points, window of size 3 with exactly 5
    window = 4 if n_points >= 6 else 3
    return int_t, _getLRAdjTraj(int_t, int_y, window)
    

def crossNearestInfo(
    target_df: pd.DataFrame,
    source_df: pd.DataFrame,
    how: str,
    window: float
) -> pd.DataFrame:
    """ Function used to cross dataframes by nearest date.

    Both frames must be indexed by a two-level MultiIndex whose first level identifies the subject
    and whose second level is a date. For every (subject, target date) entry, the row of `source_df`
    with the closest date is attached, provided that the absolute difference is below `window` days.
    Entries without any source row inside the window are dropped.

    Parameters
    ----------
    target_df : frame defining the entries (and the index) of the result.
    source_df : frame providing the information to attach.
    how : join strategy forwarded to `DataFrame.join`.
    window : maximum allowed distance in days (exclusive) between target and source dates.

    Returns
    -------
    pd.DataFrame indexed by (subject, target date), with one row per index entry.

    NOTE(review): when both frames use the same name for their date level, `DataFrame.join` matches
    on both index levels, i.e. only exact date matches are crossed and `window` has no effect -
    confirm this is intended by the callers.
    """
    days_diff_colname = '_days_diff'

    # get date columns
    subject_id = target_df.index.names[0]
    target_date = target_df.index.names[1]
    source_date = source_df.index.names[1]

    # cross information
    crossed_df = target_df.join(source_df, how=how).reset_index()

    # absolute distance (in days) between the target and the source dates
    crossed_df[days_diff_colname] = (crossed_df[target_date] - crossed_df[source_date]).dt.days.abs()

    # select information in window (rows with a missing date produce NaN and are discarded here)
    crossed_df = crossed_df.loc[crossed_df[days_diff_colname] < window]

    # keep, for every (subject, target date), the row with the nearest source date. Sorting and dropping
    # duplicates does not depend on the version-specific behavior of GroupBy.nth
    crossed_df = (
        crossed_df
        .sort_values([subject_id, target_date, days_diff_colname])
        .drop_duplicates(subset=[subject_id, target_date], keep='first')
        .set_index([subject_id, target_date])
        .drop(columns=[days_diff_colname])
    )

    assert not crossed_df.index.duplicated().any()

    return crossed_df

# Data loading and processing

## Neuroimaging information

In [None]:
# load all the neuroimaging information
# For every modality declared in PATH_TO_IMAGE_FEATS: read the feature table and its outlier table, keep
# only the entries flagged with outlier == 0 and keep a single acquisition per (subject_id, date).
# The cleaned tables are stored in `cross_mod_data`, keyed by modality name.
cross_mod_data = {}
for mod, mod_files in PATH_TO_IMAGE_FEATS.items():
    # load the data and the outliers
    data = pd.read_parquet(mod_files['data'])
    outliers = pd.read_parquet(mod_files['outliers'])

    print('(%s) Init shape: %r' % (mod, list(data.shape)))

    # remove outliers
    # (rows are selected by index label, so both tables are assumed to share the same index - TODO confirm)
    data = data.loc[outliers.loc[outliers.outlier == 0].index].copy()
    
    print('(%s) Shape after removing outliers: %r' % (mod, list(data.shape)))

    # for different acquisitions performed in the same day select the last one (in sorted index order);
    # 'acquisition_id' is moved out of the index and discarded before grouping
    # NOTE(review): the index of the result depends on the pandas version, because GroupBy.nth changed from
    # a reducer to a filter in pandas 2.0 - confirm the version this notebook targets
    data = data\
        .sort_index().reset_index('acquisition_id').drop(columns=['acquisition_id'])\
        .groupby(['subject_id', 'date']).nth(-1).copy()
    
    print('(%s) Shape after removing duplicated acquisitions: %r\n' % (mod, list(data.shape)))
    
    cross_mod_data[mod] = data


In [None]:
# generate a crossed dataset with all the data built upon the MRI
# (FDG is attached to the MRI entries first, then amyloid is attached to the MRI-FDG entries)
mri_fdg = crossNearestInfo(
    target_df=cross_mod_data['mri'],
    source_df=cross_mod_data['fdg'],
    how='inner',
    window=CROSS_NEUROIMAGING_WINDOW
)

print('Number of entries MRI-FDG merging: %d' % len(mri_fdg))

mri_fdg_amy = crossNearestInfo(
    target_df=mri_fdg,
    source_df=cross_mod_data['amy'],
    how='inner',
    window=CROSS_NEUROIMAGING_WINDOW
)

print('Number of entries MRI-FDG-AMY merging: %d' % len(mri_fdg_amy))

# sanity check: the crossed table must contain exactly the feature columns of the three modalities
assert mri_fdg_amy.shape[1] == (
    cross_mod_data['mri'].shape[1] + cross_mod_data['fdg'].shape[1] + cross_mod_data['amy'].shape[1])

# select all subject evaluations with at least NEUROIMAGING_ACQ_OFFSET years between them. The first
# acquisition of a subject is always kept; afterwards an acquisition is kept when it is at least
# NEUROIMAGING_ACQ_OFFSET years later than the previously kept one.
min_offset_days = NEUROIMAGING_ACQ_OFFSET * 365
mri_fdg_amy_baseline = []
for sub_id, sub_df in mri_fdg_amy.groupby('subject_id'):
    if len(sub_df) == 1:
        mri_fdg_amy_baseline.append(sub_df)
        continue

    # select entries to add
    dates = sub_df.reset_index('date')['date']
    # explicit writable copy: the array is modified in place below, and arrays returned by pandas can be
    # read-only (Copy-on-Write)
    diff_from_baseline = (dates - dates.iloc[0]).dt.days.to_numpy(copy=True)

    index_to_add = [0]
    while np.any(diff_from_baseline >= min_offset_days):
        # first acquisition far enough from the current reference
        idx = int(np.where(diff_from_baseline >= min_offset_days)[0][0])
        index_to_add.append(idx)
        # the selected acquisition becomes the new reference (earlier entries become negative)
        diff_from_baseline -= diff_from_baseline[idx]

    # save the entry
    mri_fdg_amy_baseline.append(sub_df.iloc[index_to_add])

mri_fdg_amy_baseline = pd.concat(mri_fdg_amy_baseline, axis=0)
mri_fdg_amy_baseline = mri_fdg_amy_baseline.sort_index().copy()

print('Number of entries MRI-FDG-AMY baseline: %d' % len(mri_fdg_amy_baseline))

## Neuropsychological information

In [None]:
# read the pre-computed composite scores and keep them sorted by index
composites = pd.read_parquet(PATH_TO_COMPOSITES).sort_index().copy()

In [None]:
# calculate the change normalized by year to limit the minimum and maximum allowable change

# change of every composite between consecutive evaluations of the same subject (the first evaluation
# of each subject has no previous value and is discarded)
composites_change = composites[TRAJECTORY_VARIABLES].groupby('subject_id').diff().dropna()

# time (in years) elapsed since the previous row; rows are matched by index, so only the entries kept
# above are used
evaluation_dates = pd.Series(
    composites.index.get_level_values('neurobat_date'), index=composites.index, name='neurobat_date_')
elapsed_years = (evaluation_dates.diff().dt.days / 365).dropna()
composites_change = composites_change.join(elapsed_years)

# yearly rate of change of every composite
composites_change = composites_change[TRAJECTORY_VARIABLES].div(composites_change['neurobat_date_'], axis=0)

# the 20th / 80th percentiles of the yearly change are used as the allowed limits
min_composite_allowed_change = composites_change.quantile(0.2).to_dict()
max_composite_allowed_change = composites_change.quantile(0.8).to_dict()

min_composite_allowed_change, max_composite_allowed_change

In [None]:
# smooth erratic trajectories (potential outliers)
# Subjects with fewer than 3 evaluations are left untouched. For the rest, every trajectory variable is
# first limited to the allowed yearly change and then smoothed; the corrected values are written back
# into `composites`.
# NOTE(review): `composites` is overwritten in place, so running this cell twice smooths values that were
# already smoothed.
for sub_id, sub_df in tqdm(composites.groupby('subject_id')):
    if len(sub_df) < 3:
        continue

    for var in TRAJECTORY_VARIABLES:
        
        # gets a version where the abrupt changes in data are smoothed out
        y_adj = adjustTrajectory(
            sub_df['years_followup_neurobat'].values,
            sub_df[var].values, 
            min_change=min_composite_allowed_change[var],
            max_change=max_composite_allowed_change[var]
        )

        # apply smoothing by fitting linear regression models in moving windows and averaging the
        # predictions of the overlapping windows (when the number of follow-ups allows). The series is
        # interpolated to one point per year (int_freq=1.0) before smoothing.
        corr_t, corr_y = smoothTrajectory(
            sub_df['years_followup_neurobat'].values, 
            y_adj, 
            int_freq=1.0
        )
        # revert the interpolation to match the real time stamps
        corr_y = np.interp(sub_df['years_followup_neurobat'].values, corr_t, corr_y)

        # correct the values (all the rows of the subject for this variable)
        composites.loc[sub_id, var] = corr_y

        """  # Trajectory visualization for debugging
        plt.plot(
            sub_df['years_followup_neurobat'].values, corr_y, marker='o', label='smooth'
        )
        plt.plot(
            sub_df['years_followup_neurobat'].values,
            sub_df[var].values, marker='o', label='real'
        )
        plt.savefig(f'./temp/{sub_id}.png')
        plt.close()
        """

## Select neuropsychological information + neuroimaging + follow-up

In [None]:
# select subjects and entries with neuroimaging information and neuropsychological 
# follow-up information
# (only the subjects present in `mri_fdg_amy_baseline` are kept)
composites_neu_long = composites.copy()
composites_neu_long = composites_neu_long.loc[
    mri_fdg_amy_baseline.index.get_level_values('subject_id').unique()
].copy()

# for those subjects with MRI, FDG, and Amyloid, remove all the evaluations
# before the first acquisition
# NOTE(review): iterating `.nth(0).index` as (subject, date) pairs relies on GroupBy.nth keeping the original
# two-level index (pandas >= 2.0 behavior) - confirm the pandas version
index_to_remove = []
for sub_id, sub_date in mri_fdg_amy_baseline.groupby('subject_id').nth(0).index:

    # calculate the minimum date adding an offset
    # (CROSS_NEUROPSYCHO_WINDOW is expressed in days and converted here to whole years + whole months)
    min_date = sub_date - pd.DateOffset(
        years=int(CROSS_NEUROPSYCHO_WINDOW / 365),
        months=int(((CROSS_NEUROPSYCHO_WINDOW / 365) % 1) * 12))

    # get the indices that will be removed (evaluations of the subject dated before `min_date`)
    index_ = composites_neu_long.loc[[sub_id]].loc[
        composites_neu_long.loc[sub_id].index < min_date
    ].index
    
    # save the indices to remove
    index_to_remove.append(index_)

print('Initial neuropsychological entries: %d' % len(composites_neu_long))

# drop the collected entries; a subject disappears completely when all its evaluations are removed
for index_ in index_to_remove:
    composites_neu_long = composites_neu_long.drop(index=index_)
    
print('Neuropsychological entries after removing indices with past data: %d' % len(composites_neu_long))

print('Number of patients after removing indices: %d' % len(
    composites_neu_long.index.get_level_values('subject_id').unique()
))

In [None]:
# to leverage repeated acquisitions, get trajectory-level information. Also filter entries by inclusion criteria
# Every entry of `mri_fdg_amy_baseline` (subject, image date) is the starting point of a candidate trajectory
# made of the neuropsychological evaluations performed from the image date (minus the allowed window)
# onwards. Candidates failing an inclusion criterion are registered in the corresponding `miss_*` list and
# skipped.
visited_subjects = {}
miss_in_removed_indices = []
miss_by_num_followups = []
miss_by_time_followup = []
miss_by_neuropsycho_window = []
composites_neu_long_window = []

unique_composite_subjects = composites_neu_long.index.get_level_values('subject_id').unique()
for sub_id, sub_date in mri_fdg_amy_baseline.index:

    # apply the same offset as in the previous code to select the evaluations
    init_date = sub_date - pd.DateOffset(
        years=int(CROSS_NEUROPSYCHO_WINDOW / 365),
        months=int(((CROSS_NEUROPSYCHO_WINDOW / 365) % 1) * 12))

    # subjects without any remaining neuropsychological evaluation
    if sub_id not in unique_composite_subjects:
        miss_in_removed_indices.append(sub_id)
        continue

    # select all valid evaluations
    sub_df = composites_neu_long.loc[[sub_id]]
    sub_df = sub_df.loc[
        sub_df.index.get_level_values('neurobat_date') >= init_date
    ].copy()

    # check number of follow-ups
    if len(sub_df) < MIN_NUMBER_FOLLOWUPS:
        miss_by_num_followups.append((sub_id, sub_date))
        continue

    # check follow-up time (years between the first and the last selected evaluation)
    followup_time = float(
        np.cumsum(sub_df.index.get_level_values('neurobat_date').to_series().diff().dt.days.fillna(0).values)[-1] / 365
    )

    if followup_time < MIN_YEARS_FOLOWUP:
        miss_by_time_followup.append((sub_id, sub_date))
        continue

    # add neuroimaging date
    sub_df['date'] = sub_date
    sub_df = sub_df.reset_index('neurobat_date')

    # check neuropsychological window: at least one evaluation must be close enough to the image (the
    # lower limit is already guaranteed by the `init_date` filter above). Trajectories failing the check
    # are skipped, like in the rest of the inclusion criteria.
    if not ((sub_df['neurobat_date'] - sub_df['date']).dt.days <= CROSS_NEUROPSYCHO_WINDOW).any():
        miss_by_neuropsycho_window.append((sub_id, sub_date))
        continue

    # add the trajectory id: the first trajectory of a subject uses the subject id, the following ones
    # use '<subject id>_<counter>'
    if sub_id in visited_subjects:
        sub_df['trajectory_id'] = '{}_{}'.format(sub_id, visited_subjects[sub_id])
        visited_subjects[sub_id] += 1
    else:
        sub_df['trajectory_id'] = sub_id
        visited_subjects[sub_id] = 1

    composites_neu_long_window.append(sub_df)

# concatenate the trajectories
composites_neu_long_window = pd.concat(
    composites_neu_long_window, axis=0).reset_index().set_index(['trajectory_id', 'neurobat_date']).sort_index().copy()

# recalculate follow-up time so that every trajectory starts at 0 years
composites_neu_long_window['years_followup_neurobat'] = composites_neu_long_window.groupby('trajectory_id')['years_followup_neurobat'].diff().fillna(0)
composites_neu_long_window['years_followup_neurobat'] = composites_neu_long_window.groupby('trajectory_id')['years_followup_neurobat'].cumsum()

print('Number of subjects removed by matching with neuropsychological information: %d' % len(miss_in_removed_indices))
print('Number of subjects removed by number of follow-ups: %d' % len(miss_by_num_followups))
print('Number of subjects removed by follow-up time: %d' % len(miss_by_time_followup))
print('Number of subjects removed by neuropsychological window: %d' % len(miss_by_neuropsycho_window))
print('Number of unique trajectories: %d' % len(composites_neu_long_window.index.get_level_values('trajectory_id').unique()))
print('Number of unique subjects: %d' % len(composites_neu_long_window['subject_id'].unique()))
print('Number of entries associated with neuroimaging and neuropsychological data: %d' % len(composites_neu_long_window))

In [None]:
# Subjects without a neuroimaging-based trajectory keep their whole neuropsychological history as a
# single trajectory (trajectory id = subject id, no image date). Subjects selected in the previous step
# are taken from `composites_neu_long_window` instead, because for them we are only interested in tracking
# from the time of image acquisition.
subjects_with_trajectory = composites_neu_long_window['subject_id'].unique()
has_trajectory = composites.index.get_level_values('subject_id').isin(subjects_with_trajectory)

composites_no_trajectory = (
    composites.loc[~has_trajectory]
    .assign(trajectory_id=lambda df: df.index.get_level_values('subject_id'))
    .reset_index()
    .set_index(['trajectory_id', 'neurobat_date'])
    .sort_index()
    .assign(date=pd.NaT)
)

composites_long = pd.concat((composites_no_trajectory, composites_neu_long_window), axis=0).sort_index()

assert not composites_long.index.duplicated().any()

# Generate a dataset with the remaining subjects used to pre-train the models

per_trajectory = composites_long.groupby('trajectory_id')

# position of every evaluation inside its trajectory (1 = first evaluation)
composites_long['n_followup'] = per_trajectory.cumcount() + 1

# total number of evaluations of the trajectory
composites_long['max_followup'] = \
    composites_long.groupby('trajectory_id')['n_followup'].transform('max')

# total follow-up time of the trajectory
composites_long['max_years_followup_neurobat'] = \
    composites_long.groupby('trajectory_id')['years_followup_neurobat'].transform('max')

# select the trajectories with a minimum follow-up time
followup_mask = composites_long['max_years_followup_neurobat'].ge(MIN_YEARS_FOLOWUP)

print('Initial number of subjects: %d' % len(composites_long.index.get_level_values('trajectory_id').unique()))
print('Number of subjects after filtering by follow-up time: %d' % len(composites_long.loc[followup_mask].index.get_level_values('trajectory_id').unique()))

The following code performs an interpolation at regular time intervals on the different scores of the neuropsychological measures.

In [None]:
# apply an interpolation of values at regular intervals to smooth trajectories 
# (use only subjects with follow-up information)
# Every trajectory selected by `followup_mask` is resampled every 0.5 years, from 0 up to MAX_YEARS_FOLLOWUP
# (or up to the grid point closest to its last evaluation, see `interpolateTimeSeries`).

# interpolate the time series
composites_intp = []
for sub_id, sub_df in tqdm(composites_long.loc[followup_mask].groupby('trajectory_id'), desc='Performing interpolation...'):

    # perform interpolations
    # (the time grid only depends on the evaluation times, so it is the same for all the variables)
    sub_df_intp = {}
    for var in TRAJECTORY_VARIABLES:
        var_t, var_y = interpolateTimeSeries(
            sub_df['years_followup_neurobat'].values, 
            sub_df[var].values, 
            step=0.5, max_t=MAX_YEARS_FOLLOWUP+0.1
        )
        sub_df_intp['years_followup_neurobat'] = var_t
        sub_df_intp[var] = var_y
        
    # create the dataframe
    # (identifiers and the date of the first evaluation are repeated for every interpolated point)
    sub_df_intp['trajectory_id'] = np.array([sub_id] * len(var_t))
    sub_df_intp['subject_id'] = np.array([sub_df['subject_id'].values[0]] * len(var_t))
    sub_df_intp['neurobat_date'] = np.array([sub_df.reset_index().iloc[0].loc['neurobat_date']] * len(var_t))
    sub_df_intp = pd.DataFrame(sub_df_intp)
    
    # process dates: date of the interpolated point = date of the first evaluation + follow-up time
    sub_df_intp['neurobat_date'] = pd.to_datetime(sub_df_intp['neurobat_date'])
    sub_df_intp['neurobat_date'] = sub_df_intp.apply(lambda row: addYears(row['neurobat_date'], row['years_followup_neurobat']), axis=1)
    sub_df_intp['neurobat_date'] = pd.to_datetime(sub_df_intp['neurobat_date'])
    
    # set index
    sub_df_intp = sub_df_intp.set_index(['trajectory_id', 'neurobat_date']).sort_index()

    # save results
    composites_intp.append(sub_df_intp.copy())

composites_intp = pd.concat(composites_intp, axis=0)
composites_intp

In [None]:
# show a grid with a few randomly selected trajectories, comparing the real evaluations with the
# interpolated ones for one of the composites
np.random.seed(1997)
nrows = 3
ncols = 4
varidx = 1   # position in TRAJECTORY_VARIABLES of the composite to display

sample_ids = np.random.choice(composites_intp.index.get_level_values('trajectory_id').unique(), size=nrows*ncols)
fig, axes = plt.subplots(nrows, ncols, figsize=(20, 8))

displayed_var = TRAJECTORY_VARIABLES[varidx]
series_to_draw = (
    ('Real', 'blue', composites_long),
    ('Interp', 'orange', composites_intp),
)

for ax, _id in zip(axes.flat, sample_ids):

    for label, c, frame in series_to_draw:
        trajectory = frame.loc[_id]
        x = trajectory['years_followup_neurobat'].values
        y = trajectory[displayed_var].values

        # markers (without legend entry) plus the connecting line
        ax.scatter(x, y, label=None, s=25, color=c)
        ax.plot(x, y, label=label, color=c)

    ax.legend(loc='upper left')
    setLayout(ax)
    ax.set_title('Id: %s' % _id)

fig.tight_layout()
plt.show()

## Trajectory $\Delta$ calculation


A linear regression model is fitted for each individual in order to estimate the overall trend of the cognitive trajectory.

For subjects where the goodness of fit of the regression model ($R^2$) is poor, the slope is instead calculated from the values at the beginning and at the end of the follow-up.

In [None]:
# get normative subjects
# Build a table with one row per subject holding the diagnosis, MMSE and CDR at the first (suffix _1) and
# last (suffix _2) evaluation. Steps: keep the trajectories selected by `followup_mask`, keep one row per
# (subject, evaluation date), take the first and last evaluation of each subject, number them with `mark`
# (1 = first, 2 = last) and pivot them into columns.
sel_vars = ['diagnosis', 'mmse', 'cdr']
composites_long_diag = composites_long.loc[followup_mask]\
    .reset_index().set_index(['subject_id', 'neurobat_date'])\
    .groupby(['subject_id', 'neurobat_date']).nth(0).sort_index()\
    .groupby('subject_id').nth([0, -1])[sel_vars].copy()
composites_long_diag['mark'] = 1
composites_long_diag['mark'] = composites_long_diag.groupby('subject_id')['mark'].cumsum()
composites_long_diag = composites_long_diag\
    .reset_index().drop(columns=['neurobat_date']).set_index(['subject_id', 'mark'])
composites_long_diag = composites_long_diag\
    .reset_index().pivot(columns=['mark'], values=sel_vars, index=['subject_id'])
# flatten the (variable, mark) column levels into '<variable>_<mark>'
composites_long_diag.columns = ['{}_{}'.format(*c) for c in composites_long_diag.columns]

# normative subjects: diagnosed as control, MMSE >= 26 and CDR == 0 at both the first and last evaluation
# NOTE(review): this list contains subject ids, while other frames of the notebook are indexed by
# trajectory id; both only coincide for the first trajectory of a subject - confirm this is intended
normative_subjects = composites_long_diag.loc[
    (composites_long_diag.diagnosis_1 == 'control') & 
    (composites_long_diag.diagnosis_2 == 'control') & 
    (composites_long_diag.mmse_1 >= 26) & 
    (composites_long_diag.mmse_2 >= 26) &
    (composites_long_diag.cdr_1 == 0.0) & 
    (composites_long_diag.cdr_2 == 0.0)
].index.tolist()


print('Number of normative subjects: %d\n' % len(normative_subjects))

# select subjects with negative amyloid status
# (subjects with at least one entry where `uc_berkley_negative_amyloid_1Y` equals 1.0)
amy_neg_subject_ids = composites_long.loc[composites_long.uc_berkley_negative_amyloid_1Y == 1.0]['subject_id'].unique().tolist()

normative_subjects = [sub_id for sub_id in normative_subjects if sub_id in amy_neg_subject_ids]
    
print('(after filtering by amyloid status) Number of normative subjects: %d\n' % len(normative_subjects))

In [None]:

# get the normative cut-off value for each composite: the PERCENTILE_FOR_HC_CLIPPING quantile of the
# interpolated values of the normative subjects
normative_cutoff_vals = composites_intp.loc[normative_subjects][TRAJECTORY_VARIABLES]\
    .quantile(PERCENTILE_FOR_HC_CLIPPING).to_dict()
pprint(normative_cutoff_vals)

# display normative individual trajectories compared to the rest of the population
fig, axes = plt.subplots(2, 2, figsize=(10, 5))
axes = list(axes.flatten())

# set for constant-time membership checks inside the plotting loop
normative_subjects_set = set(normative_subjects)

for sub_id, sub_df in composites_intp.groupby('trajectory_id'):
    # normative trajectories are highlighted in green, the rest are drawn in light grey
    is_normative = sub_id in normative_subjects_set
    color = 'green' if is_normative else 'grey'
    alpha = 0.5 if is_normative else 0.2
    for var, ax in zip(TRAJECTORY_VARIABLES, axes):
        ax.plot(
            sub_df['years_followup_neurobat'].values,
            sub_df[var].values,
            alpha=alpha, color=color
        )

for var, ax in zip(TRAJECTORY_VARIABLES, axes):
    setLayout(ax)
    # normative cut-off of the composite
    ax.axhline(normative_cutoff_vals[var], lw=2, color='red')
    ax.set_title(var)
    ax.set_xlabel('Years')
    ax.set_ylabel('Value')

plt.subplots_adjust(hspace=0.5)
plt.show()

# apply the clipping: values at or above the normative cut-off are replaced by the cut-off. The histograms
# show, before clipping, the values that will be clipped ('Ignored') and the ones that are kept.
fig, axes = plt.subplots(2, 2, figsize=(8, 5))
axes = list(axes.flatten())

for ax, var in zip(axes, TRAJECTORY_VARIABLES):
    mask = composites_intp[var] >= normative_cutoff_vals[var]
    composites_intp.loc[mask, var].hist(alpha=0.5, density=False, label='Ignored', ax=ax)
    composites_intp.loc[~mask, var].hist(alpha=0.5, density=False, label='No ignored', ax=ax)
    ax.set_title(var)
    ax.legend()
    composites_intp.loc[mask, var] = normative_cutoff_vals[var]

plt.subplots_adjust(hspace=0.4)
plt.show()
    

In [None]:
# calculate the average slope per year for each subject
# For every trajectory and composite a linear regression (score ~ years of follow-up) is fitted. When its
# R2 is below `r2_cutoff`, the slope is replaced by the one obtained from the beginning and the end of the
# follow-up. The result is a table with one row per trajectory and, for every composite, the columns
# baseline, last, lr_delta, lr_rmse and lr_r2.
r2_cutoff = 0.5    # minimum R2 required to trust the slope of the linear regression
show_plots = 10    # maximum number of diagnostic figures displayed for poorly fitted trajectories
deltas_df = []
bad_adjustment = []
for sub_id, sub_df in composites_intp.groupby('trajectory_id'):
    years = sub_df['years_followup_neurobat'].values
    for var in TRAJECTORY_VARIABLES:
        y_true = sub_df[var].values

        lr = LinearRegression().fit(years[:, np.newaxis], y_true)

        # calculate the goodness of fit
        r2 = r2_score(y_true, lr.predict(years[:, np.newaxis]))

        # by default use the line fitted by the regression
        slope = lr.coef_[0]
        intercept = float(lr.intercept_)

        # for subjects below the R2 cut-off calculate the slope considering
        # the initial and final times
        poor_fit = r2 < r2_cutoff
        if poor_fit:
            bad_adjustment.append(sub_id)

            # slope between the maximum value of the first year and the maximum value of the last year
            init_val = y_true[years <= 1.0].max()
            end_val = y_true[years > (years.max() - 1.0)].max()
            slope = (end_val - init_val) / years.max()

            # recalculate the intercept based on the new slope (optimal intercept calculated using an
            # explicit derivation)
            intercept = np.mean(y_true - slope * years)

        # calculate the residual error of the selected line
        y_hat = intercept + slope * years
        rmse = float(np.sqrt(np.mean((y_hat - y_true)**2)))

        # display the model adjustment of the first `show_plots` poorly fitted (trajectory, variable) pairs
        if poor_fit and len(bad_adjustment) <= show_plots:
            fig, ax = plt.subplots(figsize=(4, 2))
            ax.plot(years, y_true)
            ax.scatter(years, y_true)
            ax.plot(
                [0, years.max()],
                [lr.intercept_, lr.intercept_ +  lr.coef_[0] * years.max()],
                lw=2, color='red', label=f'LR slope ({lr.coef_[0]:.1f})'
            )
            ax.plot(
                [0, years.max()],
                [intercept, intercept +  slope * years.max()],
                lw=2, color='green', label=f'Adj slope ({slope:.1f})'
            )
            setLayout(ax)
            ax.legend()
            ax.set_ylabel(var)
            ax.set_xlabel('Years')
            ax.set_title('{} - $r^2 = ${:.2f} (rmse = {:.2f})'.format(sub_id, r2, rmse))
            plt.show()

        # save the results
        deltas_df.append({
            'trajectory_id': sub_id,
            'variable': var,
            'baseline': y_true[0],
            'last': y_true[-1],
            'lr_delta': slope,
            'lr_rmse': rmse,
            'lr_r2': r2
        })

# create the dataframe
deltas_df = pd.DataFrame(deltas_df)

# pivot the dataframe
deltas_df = deltas_df.pivot(
    index=['trajectory_id'],
    columns=['variable'],
    values=['baseline', 'last', 'lr_delta', 'lr_rmse', 'lr_r2']
)

# flat columns ('<metric>_<variable>')
deltas_df.columns = ['{}_{}'.format(e[0], e[1]) for e in deltas_df.columns]
deltas_df

In [None]:
# histogram of the R2 of the linear regressions (one panel per column whose name contains 'r2')
deltas_df.filter(like='r2').hist();

### Cut-off definition


In this part of the code, cut-off points of normality/non-normality are established according to the distribution of deltas in the group of normative control subjects.

In [None]:
# label every trajectory as stable vs decline, using the deltas of the normative subjects as reference
cognitive_normal_cutoff = 75           # baseline and last value required to use a normative subject as reference
cognitive_normal_quantile = 0.05        # deltas below this quantile of the reference subjects are marked as decline
delta_cutoffs = {}

for var in TRAJECTORY_VARIABLES:
    cutoff = cognitive_normal_cutoff
    quantile = cognitive_normal_quantile
    delta_var = 'lr_delta_%s' % var

    # normative subjects starting and ending at or above the required value for this composite
    normative_deltas = deltas_df.loc[normative_subjects]
    var_mask = normative_deltas['baseline_%s' % var].ge(cutoff) & normative_deltas['last_%s' % var].ge(cutoff)

    print('(%s) Number of normative subjects: %d' % (var, var_mask.sum()))

    # cut-off: lower quantile of the deltas of the reference subjects
    delta_cutoffs[delta_var] = normative_deltas.loc[var_mask][delta_var].quantile(quantile)
    print('Cut-off: %.3f' % delta_cutoffs[delta_var])

    # binary label: 1 when the delta of the trajectory is below the cut-off
    delta_var_bin = deltas_df[delta_var].lt(delta_cutoffs[delta_var])
    deltas_df['%s_binary' % delta_var] = delta_var_bin.astype(int)

    print('Percentage of positive class: {:.2f}%'.format(delta_var_bin.mean() * 100))
    print()

### Data exploration

In [None]:
# Analyze the calculated deltas: build one row per trajectory that carries the trajectory deltas
# together with both the baseline and the last diagnosis.
composites_long_with_deltas = composites_long.join(deltas_df)
composites_long_with_deltas_baseline = composites_long_with_deltas.groupby('trajectory_id').nth(0)
composites_long_with_deltas_last = composites_long_with_deltas.groupby('trajectory_id').nth(-1)
composites_long_with_deltas_baseline = \
    composites_long_with_deltas_baseline.reset_index('neurobat_date').rename(columns={'diagnosis': 'baseline_diagnosis'}).join(
        composites_long_with_deltas_last.reset_index('neurobat_date')[['diagnosis']].rename(columns={'diagnosis': 'last_diagnosis'})
)
composites_long_with_deltas_baseline['diagnosis'] = composites_long_with_deltas_baseline['baseline_diagnosis']

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for var in TRAJECTORY_VARIABLES:
        delta_var = 'lr_delta_%s' % var

        # one panel per diagnosis time point: left = grouped by baseline diagnosis,
        # right = grouped by last diagnosis
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))

        for ax, flag in zip(axes.flatten(), ['baseline', 'last']):
            # FIX: the grouping column now follows `flag`. Previously both panels drew the
            # boxplot by baseline diagnosis and the stripplot by last diagnosis, so the two
            # panels were identical and the two layers of each panel were inconsistent.
            diagnosis_col = '%s_diagnosis' % flag

            sns.boxplot(
                data=composites_long_with_deltas_baseline,
                y=delta_var,
                x=diagnosis_col,
                notch=True, showcaps=False,
                flierprops={"marker": "x"},
                boxprops={"facecolor": (.3, .5, .7, .5)},
                medianprops={"color": "r", "linewidth": 2},
                zorder=2,
                ax=ax
            )
            # same frame and same grouping column as the boxplot, so points and boxes line up
            sns.stripplot(
                data=composites_long_with_deltas_baseline,
                x=diagnosis_col,
                y=delta_var,
                color="0.3",
                alpha=0.3,
                zorder=1,
                ax=ax,
            )
            # decliner cut-off derived from the normative group
            ax.axhline(delta_cutoffs[delta_var], lw=4, color='#00FF00')
            ax.set_xlabel('')
            ax.set_title('Diagnosis (%s)' % flag, size=15)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.grid(alpha=0.3, color='black')
        fig.suptitle(var, y=1.05, size=17)
        plt.show()

        # summary of the delta per diagnosis transition (baseline -> last); group once, reuse
        transition_deltas = composites_long_with_deltas_baseline.groupby(
            ['baseline_diagnosis', 'last_diagnosis'])[delta_var]
        display(pd.concat([
            transition_deltas.count().rename('count_%s' % var),
            transition_deltas.mean().rename('mean_delta_%s' % var).round(decimals=2),
            transition_deltas.quantile(0.10).rename('q10_delta_%s' % var).round(decimals=2),
            transition_deltas.quantile(0.25).rename('q25_delta_%s' % var).round(decimals=2),
            transition_deltas.quantile(0.90).rename('q90_delta_%s' % var).round(decimals=2),
        ], axis=1))

        print('\n\n')

# Final database exportation

We keep only the trajectories (rows of the deltas dataframe) of subjects that have associated neuroimaging data and whose baseline diagnosis is not dementia.

In [None]:
# select variables of interest for the rest of clinical information
# clinical/demographic fields and individual neuropsychological tests carried into the final table
clinical_columns = [
    'subject_id', 'diagnosis', 'yschooling', 'sex_1M_2F',
    'age', 'mmse', 'cdr', 'years_followup_neurobat',
    'date',

    # neuropsychological tests
    'memory_avlt_trial_1',
    'memory_avlt_trial_2',
    'memory_avlt_trial_6',
    'memory_avlt_delayed',
    'memory_avlt_recognition',
    'memory_word_recognition',
    'memory_word_recall_delayed',
    'memory_word_recall',
    'language_cat_fluency',
    'language_naming',
    'language_bnt_tot',
    'language_word_finding_diff',
    'exec_tmt_a_time',
    'exec_tmt_b_time',
    'attention_number_cancellation',
    'visuos_clock_copy_tot_score',
    'visuos_clock_draw_tot_score',
    'visuos_constructional_praxis',
    'visuos_ideational_praxis',
]

# restrict the longitudinal composites to the trajectories that have calculated deltas
composites_long_with_deltas = (
    composites_long.loc[deltas_df.index][clinical_columns].sort_index().copy()
)

# pivot: first and last row of every trajectory, side by side with baseline_/last_ prefixes
# (presumably the sorted index makes nth(0)/nth(-1) the earliest/latest evaluation - confirm level order)
visits_by_trajectory = composites_long_with_deltas.reset_index('neurobat_date').groupby('trajectory_id')
composites_long_with_deltas_baseline = visits_by_trajectory.nth(0).add_prefix('baseline_')
composites_long_with_deltas_last = visits_by_trajectory.nth(-1).add_prefix('last_')
composites_long_with_deltas = (
    pd.concat([composites_long_with_deltas_baseline, composites_long_with_deltas_last], axis=1)
    .drop(columns=['last_subject_id', 'last_date'])
    .rename(columns={'baseline_subject_id': 'subject_id', 'baseline_date': 'date'})
)

# attach the baseline/last clinical information to the deltas
# NOTE: this rebinds deltas_df; later cells read the subject_id / baseline_neurobat_date columns added here
deltas_df = deltas_df.join(composites_long_with_deltas, how='inner')

# attach the neuroimaging information (inner join on subject and date: rows without imaging are dropped)
deltas_with_neuroimaging_df = (
    deltas_df
    .dropna(subset=['date'])
    .reset_index()
    .set_index(['subject_id', 'date'])
    .sort_index()
    .join(mri_fdg_amy_baseline, how='inner')
)

# discard trajectories whose baseline diagnosis is dementia
not_dementia_at_baseline = deltas_with_neuroimaging_df['baseline_diagnosis'] != 'dementia'
deltas_with_neuroimaging_df = deltas_with_neuroimaging_df.loc[not_dementia_at_baseline].copy()

# cast every lr_delta* column (continuous deltas and their *_binary flags) to float32
qual_delta_vars = [c for c in deltas_with_neuroimaging_df.columns if c.startswith('lr_delta')]
deltas_with_neuroimaging_df[qual_delta_vars] = deltas_with_neuroimaging_df[qual_delta_vars].astype(np.float32)

# look up the 2Y / 4Y diagnosis recorded at the baseline evaluation of each trajectory
followup_diagnosis = (
    composites_long[['diagnosis_2Y', 'diagnosis_4Y']]
    .fillna(np.nan)
    .reset_index()
    .rename(columns={'neurobat_date': 'baseline_neurobat_date'})
    .set_index(['trajectory_id', 'baseline_neurobat_date'])
)
deltas_with_neuroimaging_df = (
    deltas_with_neuroimaging_df
    .reset_index()
    .set_index(['trajectory_id', 'baseline_neurobat_date'])
    .join(followup_diagnosis, how='left')
    .reset_index()
    .set_index(['subject_id', 'date'])
)

In [None]:
# display some sample statistics
# descriptive statistics of the final sample, one column per baseline diagnosis
def _mean_sd(values):
    """Format a numeric series as 'mean (sd)' with one decimal each."""
    return '{:.1f} ({:.1f})'.format(values.mean(), values.std())


total_trajectories = deltas_with_neuroimaging_df.shape[0]
sample_demo = pd.DataFrame([
    {
        'Diagnosis': diagnosis,
        # group size and its share of the whole sample
        'Sample size (%)': '{:.0f} ({:.1f})'.format(
            group_df.shape[0], group_df.shape[0] / total_trajectories * 100),
        'Age (Mean, SD)': _mean_sd(group_df['baseline_age']),
        # sex is coded 1 = male / 2 = female, so (sex - 1) is a 0/1 female indicator
        'Sex (% Female)': '{:.1f}'.format((group_df['baseline_sex_1M_2F'] - 1).mean() * 100),
        'Years of formal education (Mean, SD)': _mean_sd(group_df['baseline_yschooling']),
        'MMSE (Mean, SD)': _mean_sd(group_df['baseline_mmse']),
        'Years followup (Mean, SD)': _mean_sd(group_df['last_years_followup_neurobat']),
    }
    for diagnosis, group_df in deltas_with_neuroimaging_df.groupby('baseline_diagnosis')
])

# fixed row order (control first), then transpose so each diagnosis is a column
sample_demo = sample_demo.set_index('Diagnosis').loc[['control', 'mci']]
sample_demo.T

In [None]:
# display some delta-related statistics
#   - One row per diagnosis transition
#   - Columns: average delta in each cognitive domain, number (%) of decliners in each
#     cognitive domain, and number (%) of individuals with more than one affected domain
temp_df = deltas_with_neuroimaging_df.copy()

# number of affected cognitive domains per trajectory (sum of the *_binary decliner flags)
binary_columns = [c for c in temp_df.columns if c.endswith('binary')]
temp_df['num_affected_cd'] = temp_df[binary_columns].sum(axis=1)

# collapse the raw "baseline-last" diagnosis pairs into transition groups
# (s = stable, p = progressive, r = reverter); pairs not listed here keep their raw label
transition_labels = {
    'control-control': 'sCN',
    'control-mci': 'pCN',
    'control-dementia': 'pCN',
    'mci-mci': 'sMCI',
    'mci-dementia': 'pMCI',
    'mci-control': 'rMCI',
}
temp_df['diag_transition'] = (
    temp_df['baseline_diagnosis'] + '-' + temp_df['last_diagnosis']
).replace(transition_labels)

deltas_info = {}
for diag_trans in ['sCN', 'pCN', 'sMCI', 'pMCI', 'rMCI']:
    diag_trans_df = temp_df.loc[temp_df['diag_transition'] == diag_trans]
    domain_info = {}
    for fname, dname in [
        ('Executive', 'lr_delta_exec_composite'),
        ('Memory', 'lr_delta_memory_composite'),
        ('Language', 'lr_delta_language_composite'),
        ('Visuospatial', 'lr_delta_visuospatial_composite')]:
        # mean (SD) of the continuous delta within the transition group
        domain_info[f'{fname} - quant'] = f'{diag_trans_df[dname].mean():.1f} ({diag_trans_df[dname].std():.1f})'
        # decliners: count (percentage relative to the transition group size)
        domain_info[f'{fname} - decliners'] = \
            f'{diag_trans_df[f"{dname}_binary"].sum():.0f} ({(diag_trans_df[f"{dname}_binary"].mean())*100:.1f})'

    # individuals with more than 1 affected domain.
    # FIX: the label used to read '> 2 affected domains' while the condition is (and was) '> 1'.
    multi_domain = diag_trans_df['num_affected_cd'] > 1
    domain_info['> 1 affected domains'] = f'{float(multi_domain.sum()):.0f} ({float(multi_domain.mean()) * 100:.1f})'

    deltas_info[f'{diag_trans} ({diag_trans_df.shape[0]})'] = domain_info

# rows = transition groups; columns ordered as: quantitative deltas, decliners, multi-domain
deltas_info = pd.DataFrame(deltas_info).T
deltas_info = deltas_info[
    [c for c in deltas_info.columns if 'quant' in c] +
    [c for c in deltas_info.columns if 'decliners' in c] +
    ['> 1 affected domains']
]

# FIX: the table was built but never shown; end the cell with it so it is rendered
deltas_info

In [None]:
# display some delta-related statistics (2)
#   - One row per (cognitive domain, decliner flag)
#   - Columns: delta mean (SD) and percentage of stable / progressive diagnosis transitions

# "baseline-last" diagnosis pairs counted in each category
# (note that reversion from MCI to control is counted as a stable diagnosis)
stable_transitions = ['control-control', 'mci-mci', 'mci-control']
progressive_transitions = ['control-dementia', 'control-mci', 'mci-dementia']

sample_traj = []
for fname, dname in [
    ('Executive', 'lr_delta_exec_composite'),
    ('Memory', 'lr_delta_memory_composite'),
    ('Language', 'lr_delta_language_composite'),
    ('Visuospatial', 'lr_delta_visuospatial_composite')]:
    domain_info = {}
    for group, group_df in deltas_with_neuroimaging_df.groupby('%s_binary' % dname):
        # number of trajectories per "baseline-last" diagnosis pair within this group
        group_diag_trans = (group_df['baseline_diagnosis'] + '-' + group_df['last_diagnosis']).value_counts().to_dict()
        stable_diagnosis = sum(group_diag_trans.get(pair, 0.0) for pair in stable_transitions)
        progressive_diagnosis = sum(group_diag_trans.get(pair, 0.0) for pair in progressive_transitions)

        # percentages are relative to the group size (stable vs decliner within the domain)
        domain_info[group] = {
            'Delta (Mean, SD)': '{:.2f} ({:.2f})'.format(group_df[dname].mean(), group_df[dname].std()),
            'Stable diagnosis': f"{stable_diagnosis / group_df.shape[0] * 100:.1f}",
            'Progressive diagnosis': f"{progressive_diagnosis / group_df.shape[0] * 100:.1f}",
        }

    # one frame per domain, indexed by (Domain, Group)
    domain_info_df = pd.DataFrame(domain_info).T
    domain_info_df['Domain'] = fname
    domain_info_df.index.names = ['Group']
    domain_info_df = domain_info_df.reset_index().set_index(['Domain', 'Group'])
    sample_traj.append(domain_info_df)

sample_traj = pd.concat(sample_traj, axis=0).sort_index()
sample_traj

In [None]:
# write the longitudinal dataset to disk; the file name is prefixed with today's date (YYYYMMDD)
longitudinal_export_path = BASE_STUDY_PATH / 'mounts' / 'v1' / (
    '%s_longitudinal.parquet' % datetime.now().strftime('%Y%m%d'))
deltas_with_neuroimaging_df.to_parquet(longitudinal_export_path)

## Cross-sectional dataset

Selection of cross-sectional data for pre-training of models using the different neuroimaging modalities independently. At this point, all images that have been included in the longitudinal data set have been excluded.

In [None]:
# select the neuropsychological data
# delta columns (continuous and binarized) that are attached to the neuropsychological data
delta_columns = [
    'lr_delta_exec_composite', 'lr_delta_language_composite',
    'lr_delta_memory_composite', 'lr_delta_visuospatial_composite',
    'lr_delta_exec_composite_binary', 'lr_delta_language_composite_binary',
    'lr_delta_memory_composite_binary', 'lr_delta_visuospatial_composite_binary'
]

# deltas re-indexed by (subject_id, neurobat_date) using the baseline evaluation date of each
# trajectory, so that they line up with the evaluation they were computed from
baseline_deltas = (
    deltas_df
    .reset_index()
    .rename(columns={'baseline_neurobat_date': 'neurobat_date'})
    .set_index(['subject_id', 'neurobat_date'])[delta_columns]
)

# neuropsychological data plus delta information (left join: evaluations without deltas are kept)
composites_cross_seccional = (
    composites
    .fillna(np.nan)
    .copy()
    .join(baseline_deltas, how='left')
)

In [None]:
# target modeling variables for model pretraining
target_vars = [
    'diagnosis',
    'diagnosis_2Y',
    'diagnosis_4Y',
    'memory_composite',
    'exec_composite',
    'language_composite',
    'visuospatial_composite',
    'lr_delta_exec_composite',
    'lr_delta_language_composite',
    'lr_delta_memory_composite',
    'lr_delta_visuospatial_composite'
]


def _count_subjects(frame):
    """Number of distinct values in the 'subject_id' index level of `frame`."""
    return len(frame.index.get_level_values('subject_id').unique())


# attach the neuropsychological data to the images of every modality
cross_sectional_datasets = {}
for mod_key, mod_df in cross_mod_data.items():
    init_entries = mod_df.shape[0]
    init_subjects = _count_subjects(mod_df)

    # match each image with the neuropsychological information within the allowed window
    crossed_mod_df = crossNearestInfo(
        target_df=mod_df,
        source_df=composites_cross_seccional,
        how='left',
        window=CROSS_NEUROPSYCHO_WINDOW
    )

    # keep only the first entry of every distinct combination of target variables
    repeated_targets = crossed_mod_df[target_vars].duplicated()
    crossed_mod_df = crossed_mod_df.loc[~repeated_targets].copy()

    # MRI only: discard entries that lack every delta value AND both the 2Y and 4Y diagnosis
    if mod_key == 'mri':
        no_deltas = crossed_mod_df[[
            'lr_delta_exec_composite', 'lr_delta_language_composite',
            'lr_delta_memory_composite', 'lr_delta_visuospatial_composite'
        ]].isna().all(axis=1)
        no_followup_diagnosis = crossed_mod_df[[
            'diagnosis_2Y',
            'diagnosis_4Y'
        ]].isna().all(axis=1)
        missing_mask = no_deltas & no_followup_diagnosis
        crossed_mod_df = crossed_mod_df.loc[~missing_mask].copy()

    end_entries = crossed_mod_df.shape[0]
    end_subjects = _count_subjects(crossed_mod_df)

    # save the generated data
    cross_sectional_datasets[mod_key] = crossed_mod_df

    # report how many subjects / entries survived the filtering
    print('(%s) Number of subjects: %d -> %d' % (
        mod_key, init_subjects, end_subjects
    ))
    print('(%s) Number of entries: %d -> %d' % (
        mod_key, init_entries, end_entries
    ))

    # percentage of missing values per target variable
    missing_share = crossed_mod_df[target_vars].isna().sum() / crossed_mod_df.shape[0] * 100
    display(missing_share.to_frame('perc_missing_values').round(2))

    print()

In [None]:
# write one parquet file per modality, named <YYYYMMDD>_<modality>_cross_sectional.parquet
for modality, modality_df in cross_sectional_datasets.items():
    export_name = '%s_%s_cross_sectional.parquet' % (
        datetime.now().strftime('%Y%m%d'), modality)
    export_path = os.path.join(BASE_STUDY_PATH, 'mounts', 'v1', export_name)
    modality_df.to_parquet(export_path)