Notebook used to detect artifacts in the processed images.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from scipy import stats

In [None]:
# root directory (relative to this notebook) where all the neuroimaging data is located
PATH_TO_NEUROIMAGING_DATA = os.path.join('..', 'data', 'neuroimaging', 'v1')

# parquet files generated by the different processing pipelines, one dictionary per
# modality. The key identifies the pipeline and indicates its priority
# (lower key = higher priority)
PATH_TO_MRI = {
    0: os.path.join(PATH_TO_NEUROIMAGING_DATA, '20240428_MRI_intermodality_v0.parquet'),
}
PATH_TO_FDG = {
    0: os.path.join(PATH_TO_NEUROIMAGING_DATA, '20240428_FDG_intermodality_v0.parquet'),
}
PATH_TO_AMY = {
    0: os.path.join(PATH_TO_NEUROIMAGING_DATA, '20240428_AMY_intermodality_v0.parquet'),
}

# date prefix used in the names of the generated parquets
DATE_KEY = '20240428'

# Proportion of outlier scores trimmed from each tail before computing the trimmed
# mean and the trimmed standard deviation from which the outlier cut-off is derived
# in detectOutliers (value selected ad hoc)
STATS_CUTOFF_VAL = 0.05

# dpi used for the generated figures
DPI = 100

# number of parallel jobs used to fit the Isolation Forest
N_JOBS = 16

# random seed (used by the Isolation Forest and the PCA projection)
SEED = 1997

In [None]:
def detectOutliers(df: pd.DataFrame, title: str, cut_off: float = None):
    """ Function used to detect outliers based on the Isolation Forest algorithm.

    An Isolation Forest is fitted on `df` and every row is scored with
    `score_samples`. Rows whose score falls below `cut_off` are classified as
    outliers. A two-panel figure is displayed: the histogram of the scores with
    the cut-off, and a 2D PCA projection of the z-scored data with the outliers
    highlighted.

    Parameters
    ----------
    df : pd.DataFrame
        Data to analyse, one row per sample and one column per variable.
    title : str
        Label included in the figure titles.
    cut_off : float, optional
        Score below which a sample is considered an outlier. When None, it is set
        to the trimmed mean of the scores minus 3 trimmed standard deviations,
        trimming a proportion STATS_CUTOFF_VAL of the scores from each tail.

    Returns
    -------
    pd.Index
        Index of the rows of `df` classified as outliers.
    """
    iforest = IsolationForest(
        n_estimators=600,
        max_samples=0.75,
        contamination='auto',
        max_features=0.75,
        bootstrap=False,
        n_jobs=N_JOBS,
        random_state=SEED,
    )

    # fit the isolation forest and score every sample
    iforest.fit(df.values)
    scores = iforest.score_samples(df.values)

    # default cut-off: 3 (trimmed) std below the trimmed mean of the scores
    if cut_off is None:
        trimmed_mean = stats.trim_mean(scores, STATS_CUTOFF_VAL)
        trimmed_std = stats.mstats.trimmed_std(scores, (STATS_CUTOFF_VAL, STATS_CUTOFF_VAL))
        cut_off = trimmed_mean - 3 * trimmed_std

    # compute the masks and summary values once and reuse them below
    outlier_mask = scores < cut_off
    inlier_mask = scores >= cut_off
    n_outliers = outlier_mask.sum()
    score_min = scores.min()
    score_max = scores.max()

    # display statistics
    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    fig.set_dpi(DPI)

    # --- plot scores distribution
    n_count, _, _ = axes[0].hist(scores, bins=50, color='#293462')
    max_count = n_count.max()
    axes[0].axvline(cut_off, lw=2, color='#F24C4C', label='cut-off')

    axes[0].spines['top'].set_visible(False)
    axes[0].spines['right'].set_visible(False)

    # shade the region of the histogram classified as outliers
    axes[0].fill_between(
        [score_min * 1.1, cut_off],
        [max_count * 1.1, max_count * 1.1],
        alpha=0.3,
        color='#F24C4C'
    )
    axes[0].set_ylim(0, max_count)
    # NOTE(review): the 1.1 / 0.9 factors only widen the x range when the scores
    # are negative, which is what score_samples is expected to return
    axes[0].set_xlim(score_min * 1.1, score_max * 0.9)
    axes[0].set_xlabel('Score', size=13)
    axes[0].set_ylabel('Number of samples', size=13)
    axes[0].set_title('Outlier score (%s)' % title, size=15, pad=15)

    # --- plot PCA projection
    # constant columns have a standard deviation of exactly 0; dividing by it
    # would fill the column with NaNs, which PCA does not accept. Using 1 instead
    # leaves such columns centred at 0 and does not affect the other columns.
    col_std = df.std()
    col_std[col_std == 0] = 1
    zscore_data = (df - df.mean()) / col_std
    pca = PCA(
        n_components=2,
        random_state=SEED
    ).fit(zscore_data)
    pca_emb = pca.transform(zscore_data)

    axes[1].scatter(
        pca_emb[inlier_mask, 0],
        pca_emb[inlier_mask, 1],
        s=20,
        color='#293462'
    )
    axes[1].scatter(
        pca_emb[outlier_mask, 0],
        pca_emb[outlier_mask, 1],
        s=20,
        color='#F24C4C'
    )
    axes[1].set_xlabel('PC 1', size=12)
    axes[1].set_ylabel('PC 2', size=12)
    # EV: variance explained by the two components together
    axes[1].set_title('PCA projection, EV %.3f (%s)' % (
        np.cumsum(pca.explained_variance_ratio_)[-1], title), size=15, pad=15)

    for pos in ['right', 'top', 'left', 'bottom']:
        axes[1].spines[pos].set_visible(False)

    # --- common layout
    for ax in axes.flatten():
        ax.grid(alpha=0.1, color='black')

    fig.suptitle(
        'Number of outliers {}: {} ({:.2f}%)'.format(
            title,
            n_outliers,
            n_outliers / len(scores) * 100
        ),
        size=17, y=1.15)
    plt.show()

    return df.loc[outlier_mask].index


def filterErrors(df: pd.DataFrame, iqr_factor: float = 2.5, max_outlier_frac: float = 0.2):
    """ Flag the examples that can be safely classified as errors using simple statistics.

    An example (row) is flagged when either:
      - it contains at least one missing value, or
      - more than `max_outlier_frac` of its variables (columns) lie outside the
        range median +/- `iqr_factor` * IQR, where the median and the
        interquartile range (IQR) are computed independently for each variable.

    Parameters
    ----------
    df : pd.DataFrame
        Data to check, one row per example and one column per variable.
    iqr_factor : float
        Half-width of the accepted range, expressed in interquartile ranges.
    max_outlier_frac : float
        Fraction of out-of-range variables above which a row is flagged.

    Returns
    -------
    pd.Index
        Index of the rows of `df` flagged as errors.
    """

    # flag every row with missing values (a single missing value is enough: such
    # rows cannot be processed by the downstream outlier detection)
    na_mask = df.isna().sum(axis=1) > 0
    if na_mask.any():
        print('Number of errors (missing values generated): %d' % na_mask.sum())

    # accepted range for each variable
    median = df.median(axis=0)
    q25 = df.quantile(0.25, axis=0)
    q75 = df.quantile(0.75, axis=0)
    iqr_range = iqr_factor * (q75 - q25)

    # comparisons against NaN are False, so missing values never count as out of range
    out_of_range = (df < (median - iqr_range)) | (df > (median + iqr_range))
    err_mask = out_of_range.sum(axis=1) > (df.shape[1] * max_outlier_frac)

    print('Number of errors: %d' % err_mask.sum())

    return df.loc[err_mask | na_mask].index


def mergeDataFrames(source_df: pd.DataFrame, reference_df: pd.DataFrame) -> pd.DataFrame:
    """ Attach to each source entry the outlier flag of its closest-in-time reference entry.

    Both dataframes must have 'date' and 'acquisition_id' index levels and an
    'outlier' column; a 'subject_id' index level is also required. Entries are
    matched with an inner join on the index levels left after removing 'date'
    and 'acquisition_id'. For each (subject_id, date) of the source, only the
    reference entry with the smallest absolute difference in days is kept.

    The returned dataframe is indexed like `source_df` and contains the columns
    'outlier' (source) and 'outlier_reference' (reference). An AssertionError is
    raised when it does not have as many rows as `source_df`.
    """
    # remember how the source is indexed so the result can be indexed identically
    target_levels = source_df.index.names

    # reset_index returns new objects, so the inputs are never modified in place
    src = source_df.reset_index(['date', 'acquisition_id'])
    ref = reference_df.reset_index(['date', 'acquisition_id'])

    # pair every source entry with all the reference entries sharing its index
    paired = src.join(ref, how='inner', rsuffix='_reference')

    # absolute distance in days between the source and the reference acquisition
    paired['days_diff'] = (paired['date'] - paired['date_reference']).dt.days.abs()

    # sort by distance so that the first row of every group is the closest reference
    ordered = paired.reset_index().set_index(['subject_id', 'date', 'days_diff']).sort_index()
    closest = ordered.groupby(['subject_id', 'date']).nth(0)

    # restore the source index and keep only the outlier flags
    result = closest.reset_index().set_index(target_levels)[['outlier', 'outlier_reference']]

    assert result.shape[0] == src.shape[0]

    return result



# MRI outliers

In [None]:
# load MRI data: one dataframe per pipeline, with the pipeline key appended
# to the original index as an additional 'pipeline' level
mri_frames = []
for pipeline_key, parquet_file in PATH_TO_MRI.items():
    raw = pd.read_parquet(parquet_file)
    levels = list(raw.index.names) + ['pipeline']
    tagged = raw.reset_index()
    tagged['pipeline'] = pipeline_key
    mri_frames.append(tagged.set_index(levels).sort_index())
mri = pd.concat(mri_frames, axis=0)

print('Input data shape:', mri.shape)

# flag gross errors using simple per-variable statistics
mri_filtered = filterErrors(mri)

# run the outlier detection algorithm only on the entries not flagged as errors
mri_not_filtered = ~mri.index.isin(mri_filtered)
mri_outliers = detectOutliers(mri.loc[mri_not_filtered], 'MRI')

In [None]:
# build a placeholder dataframe listing every MRI entry flagged either by the
# simple error filter or by the outlier detection algorithm, so that FDG and
# amyloid images can be discarded based on the MRI outlier information
mri_filtered_df = pd.DataFrame(
    {'mri_placeholder': np.zeros(len(mri_filtered))},
    index=mri_filtered)
mri_outliers_df = pd.concat(
    [
        mri_filtered_df,
        pd.DataFrame(
            {'mri_placeholder': np.zeros(len(mri_outliers))},
            index=mri_outliers),
    ],
    axis=0)

# the acquisition and pipeline identifiers are discarded from the index
mri_outliers_df = mri_outliers_df.reset_index(['acquisition_id', 'pipeline'], drop=True)
mri_outliers_df.shape

# FDG-PET outliers

In [None]:
# load FDG data: one dataframe per pipeline, with the pipeline key appended
# to the original index as an additional 'pipeline' level
fdg_frames = []
for pipeline_key, parquet_file in PATH_TO_FDG.items():
    raw = pd.read_parquet(parquet_file)
    levels = list(raw.index.names) + ['pipeline']
    tagged = raw.reset_index()
    tagged['pipeline'] = pipeline_key
    fdg_frames.append(tagged.set_index(levels).sort_index())
fdg = pd.concat(fdg_frames, axis=0)

# keep only the columns holding mean and std values
fdg_stat_columns = [c for c in fdg.columns if c.endswith(('mean', 'std'))]
fdg = fdg[fdg_stat_columns].copy()
print('Input data shape:', fdg.shape)

# flag gross errors using simple per-variable statistics
fdg_filtered = filterErrors(fdg)

# run the outlier detection algorithm only on the entries not flagged as errors
fdg_not_filtered = ~fdg.index.isin(fdg_filtered)
fdg_outliers = detectOutliers(fdg.loc[fdg_not_filtered], 'FDG-PET')

# AV45-PET outliers

In [None]:
# load AMY data: one dataframe per pipeline, with the pipeline key appended
# to the original index as an additional 'pipeline' level
amy_frames = []
for pipeline_key, parquet_file in PATH_TO_AMY.items():
    raw = pd.read_parquet(parquet_file)
    levels = list(raw.index.names) + ['pipeline']
    tagged = raw.reset_index()
    tagged['pipeline'] = pipeline_key
    amy_frames.append(tagged.set_index(levels).sort_index())
amy = pd.concat(amy_frames, axis=0)

# keep only the columns holding mean and std values
amy_stat_columns = [c for c in amy.columns if c.endswith(('mean', 'std'))]
amy = amy[amy_stat_columns].copy()
print('Input data shape:', amy.shape)

# flag gross errors using simple per-variable statistics
amy_filtered = filterErrors(amy)

# run the outlier detection algorithm only on the entries not flagged as errors
amy_not_filtered = ~amy.index.isin(amy_filtered)
amy_outliers = detectOutliers(amy.loc[amy_not_filtered], 'Amyloid-PET')



# Export the generated information

In [None]:
# create outlier info dataframes: every entry starts as "not an outlier" (0)
mri_outlier_info = pd.DataFrame(
    {'outlier': np.zeros(len(mri), dtype=int)}, index=mri.index)
fdg_outlier_info = pd.DataFrame(
    {'outlier': np.zeros(len(fdg), dtype=int)}, index=fdg.index)
amy_outlier_info = pd.DataFrame(
    {'outlier': np.zeros(len(amy), dtype=int)}, index=amy.index)

# mark as outliers (1) the entries flagged either by the simple error filter
# or by the outlier detection algorithm
for info, flagged_errors, flagged_outliers in [
        (mri_outlier_info, mri_filtered, mri_outliers),
        (fdg_outlier_info, fdg_filtered, fdg_outliers),
        (amy_outlier_info, amy_filtered, amy_outliers)]:
    is_flagged = info.index.isin(flagged_errors) | info.index.isin(flagged_outliers)
    info.loc[is_flagged, 'outlier'] = 1

# display outlier information (count and percentage per modality)
for modality, info in [
        ('mri', mri_outlier_info),
        ('fdg', fdg_outlier_info),
        ('amy', amy_outlier_info)]:
    n_flagged = info['outlier'].sum()
    pct_flagged = info['outlier'].mean() * 100
    print('Outliers in {}: {} ({:.2f} %)'.format(modality, n_flagged, pct_flagged))

In [None]:
# ignore pipeline information (only one pipeline has been used)
mri_outlier_info = mri_outlier_info.reset_index('pipeline').drop(columns=['pipeline'])
fdg_outlier_info = fdg_outlier_info.reset_index('pipeline').drop(columns=['pipeline'])
amy_outlier_info = amy_outlier_info.reset_index('pipeline').drop(columns=['pipeline'])

# add MRI outlier information to PET dataframes: 'outlier' holds the PET flag and
# 'outlier_reference' the flag of the MRI closest in time to each PET image
fdg_outlier_info = mergeDataFrames(fdg_outlier_info, mri_outlier_info)
amy_outlier_info = mergeDataFrames(amy_outlier_info, mri_outlier_info)

# the denominator is the number of PET outliers, so the reported value is the
# share of PET outliers whose associated MRI is also an outlier
print('Percentage of outliers in FDG-PET being also outliers in MRI: {:.2f} %'.format(
    float(
        ((fdg_outlier_info['outlier'] == 1) & (fdg_outlier_info['outlier_reference'] == 1)).sum() / 
        (fdg_outlier_info['outlier']).sum() * 100
    )
))

print('Percentage of outliers in amyloid PET being also outliers in MRI: {:.2f} %'.format(
    float(
        ((amy_outlier_info['outlier'] == 1) & (amy_outlier_info['outlier_reference'] == 1)).sum() / 
        (amy_outlier_info['outlier']).sum() * 100
    )
))

# PET images whose associated MRI is an outlier are also marked as outliers
# (at this point the dataframes only contain the 'outlier' and 'outlier_reference'
# flags, so a row sum above 0 means that at least one of them is set)
fdg_outlier_info['outlier'] = (fdg_outlier_info.sum(axis=1) > 0).astype(int)
amy_outlier_info['outlier'] = (amy_outlier_info.sum(axis=1) > 0).astype(int)

# remove auxiliary columns
fdg_outlier_info = fdg_outlier_info.drop(columns=['outlier_reference'])
amy_outlier_info = amy_outlier_info.drop(columns=['outlier_reference'])

# file with the indices associated with neuroimaging errors in the amyloid data
# that were identified by manual review
manual_amy_outliers = pd.read_csv(
    os.path.join(
        '..', '..', 'data', 'metadata', 'neuroimaging',
        '20241106_reviewed_amyloid_errors.csv'
    )
)
manual_amy_outliers['date'] = pd.to_datetime(manual_amy_outliers['date'])
manual_amy_outliers = manual_amy_outliers.set_index(['subject_id', 'date', 'acquisition_id'])

# mark the manually reviewed errors as outliers
print('Number of errors manually reviewed for amyloid information: %d' % len(manual_amy_outliers))
amy_outlier_info.loc[manual_amy_outliers.index, 'outlier'] = 1

# display outlier information
for name, df in [
    ('mri', mri_outlier_info), 
    ('fdg', fdg_outlier_info), 
    ('amy', amy_outlier_info)]:

    print('Outliers in {}: {} ({:.2f} %)'.format(
        name,
        df['outlier'].sum(),
        df['outlier'].mean() * 100))
    
mri_outlier_info.shape, fdg_outlier_info.shape, amy_outlier_info.shape

In [None]:
# export the generated dataframes; every file name combines the date key of the
# input data with the date on which the outlier information was generated
curr_date = datetime.now().strftime('%Y%m%d')
exports = [
    (amy_outlier_info, '%s_AMY_intermodality_v0_outliers_generated%s.parquet'),
    (mri_outlier_info, '%s_MRI_intermodality_v0_outliers_generated%s.parquet'),
    (fdg_outlier_info, '%s_FDG_intermodality_v0_outliers_generated%s.parquet'),
]
for outlier_info, name_template in exports:
    output_file = os.path.join(PATH_TO_NEUROIMAGING_DATA, name_template % (DATE_KEY, curr_date))
    outlier_info.to_parquet(output_file)