# Data preprocessing 

## Dropped entries:
All columns whose fraction of missing values exceeds the cutoff (60%).
Specific columns:
- COLPROT: Protocol number of measurement
- ORIGPROT: Original study protocol
- PTID?: Original study protocol
- VISCODE: Visit code (e.g. m24, covered by 'M' entry)
- SITE?: Site of study (i guess?)
- EXAMDATE: Date of measurement
- FLDSTRENG: MRI Field Strength
- FSVERSION: FreeSurfer Software Version
- IMAGEUID: LONI image ID
- ABETA_bl?: CSF ABETA at baseline
- update_stamp
- Month: Months from baseline (to nearest 6 months, based on EXAMDATEs)
- M: Months from baseline (based on VISCODE)
- Month.bl: Months from baseline based on EXAMDATEs
- Removed all baseline measurements, e.g. FAQ_bl or LDELTOTAL_BL





## Altered entries:
- PTGENDER: make it boolean (0 Female, 1 Male)
- 'DX_bl', 'PTETHCAT', 'PTRACCAT', 'PTMARRY', 'DX': converted to categorical values. TODO: one-hot encode them (dummy variables)
- 
<!-- - Normalized MRI by wholebrain volume (Ventricles, Hippocampus, Entorhinal, Fusiform, MidTemp, ICV) and their corresponding baseline measurements. -->



- Find out how to handle NaN in DX.
    - If between two equal diagnostics set to same value.
    - If not what?


## Split
- Split into train/test first, fit the imputation model on the train set only, save it, and then use it to impute the test set. Otherwise there would be data leakage!


In [1]:
import pandas as pd
import miceforest as mf
import numpy as np
import matplotlib.pyplot as plt
import os
import yaml

from sklearn.model_selection import train_test_split
from pathlib import Path
from loguru import logger

import sys
sys.path.append('../')
from Utils.Data_imputation import impute_data, store_csv
from Utils.Get_adni import get_merge, get_adni3
from Utils.util_functions import get_events
from Utils.Combine_datasets import combine_mvas_adni, post_imputation_processing

In [2]:
def impute_pure_adni(config=None):
    """Impute the ADNI merge dataset and store the results as CSV files.

    Loads the data through ``get_merge``, splits it into a train and a test
    partition, runs ``impute_data`` on the pair, and writes both the
    un-imputed partitions and the imputed datasets into the folder that
    ``impute_data`` returns.

    Args:
        config: Mapping with the run settings. This function reads the keys
            ``missing_value_cutoff``, ``fill_dx``, ``drop_empty_dx``,
            ``cohort``, ``prepare_sa`` and ``train_test_split``; the whole
            mapping is also forwarded to ``impute_data`` and ``store_csv``.
            When ``None``, the settings are loaded from
            ``impute_config_adni.yaml`` relative to the working directory.

    Returns:
        ``1`` if no configuration was passed and the YAML file does not
        exist, otherwise ``None``.
    """
    # Identity test: `== None` would invoke a custom __eq__ on the argument.
    if config is None:
        conf_path = Path('impute_config_adni.yaml')
        logger.info(f'Configuration file: {conf_path}')

        if not conf_path.exists():
            logger.warning('Configuration file not found')
            # Kept as an error code so existing callers see the same value.
            return 1
        with open(conf_path) as f:
            config = yaml.safe_load(f)

    logger.info(f"Started with configurations: {config}")

    # Load the dataset
    df = get_merge(
        cutoff=config['missing_value_cutoff'],
        fill_dx_manually=config['fill_dx'],
        remove_dx=config['drop_empty_dx'],
        cohort=config['cohort'],
        prepare_sa=config['prepare_sa'],
    )

    logger.info(f'Dataset dimensions: {df.shape}')
    logger.info(f'Dataset columns: {df.columns}')

    if config['prepare_sa']:
        # Row-wise split, stratified on the Event column.
        train, test = train_test_split(
            df,
            test_size=config['train_test_split'],
            random_state=42,
            stratify=df['Event'],
        )
    else:
        # Split on unique RIDs so that all rows sharing a RID end up in the
        # same partition.
        unique_rids = df['RID'].unique()
        train_rids, test_rids = train_test_split(
            unique_rids,
            test_size=config['train_test_split'],
            random_state=42,
        )

        # Filter the original DataFrame to create the training and test sets
        train = df[df['RID'].isin(train_rids)]
        test = df[df['RID'].isin(test_rids)]

    # Report the realised row-level test fraction.
    split = len(test) / (len(train) + len(test))
    logger.info(f"Train shape: {train.shape}\t Test shape: {test.shape}")
    logger.info(f"Actual test/train split: {split:.3f}")

    logger.info('Starting data imputation.')

    train_list, test_list, folder_name = impute_data(train, test, config)

    logger.info('Data imputation complete')
    logger.info('Storing files...')

    # Keep the un-imputed partitions alongside the imputed datasets.
    train_path = f"{folder_name}original_train.csv"
    test_path = f"{folder_name}original_test.csv"

    train.to_csv(train_path, index=False)
    test.to_csv(test_path, index=False)

    store_csv(test_list=test_list, train_list=train_list, folder_name=folder_name, config=config)
    logger.info('Files stored.')

    return

In [8]:
# Run settings for the merged ADNI table. The whole mapping is handed to
# impute_pure_adni, which forwards it to the imputation and storage helpers.
config = dict(
    num_datasets=20,
    num_iterations=20,
    num_threads=14,
    random_state=1991,
    quality=False,
    save_all_iterations=True,
    set_mean_match_candidates=5,
    device='cpu',
    directory_name='Merge_BL',
    # Keys below are read directly by impute_pure_adni.
    missing_value_cutoff=0.6,
    fill_dx=True,
    drop_empty_dx=True,
    prepare_sa=True,
    train_test_split=0.2,
    cohort=['ADNI1', 'ADNI2', 'ADNI3', 'ADNIGO'],
)

impute_pure_adni(config)

[32m2024-05-31 12:51:58.207[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_pure_adni[0m:[36m12[0m - [1mStarted with configurations: {'num_datasets': 20, 'num_iterations': 20, 'num_threads': 14, 'random_state': 1991, 'quality': False, 'save_all_iterations': True, 'set_mean_match_candidates': 5, 'device': 'cpu', 'directory_name': 'Merge_BL', 'missing_value_cutoff': 0.6, 'fill_dx': True, 'drop_empty_dx': True, 'prepare_sa': True, 'train_test_split': 0.2, 'cohort': ['ADNI1', 'ADNI2', 'ADNI3', 'ADNIGO']}[0m
  df = pd.read_csv(datapath)
[32m2024-05-31 12:52:01.320[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m100[0m - [1mMissing value feature cutoff: 60.0 %[0m
[32m2024-05-31 12:52:01.320[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m101[0m - [1mRemoving columns with cutoff: Index(['PIB', 'FBB', 'DIGITSCOR'], dtype='object')[0m
[32m2024-05-31 12:52:01.320[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merg

Initialized logger with name mice 1-20
Dataset 0
1  | RAVLT_immediate | RAVLT_learning | ADAS11 | RAVLT_forgetting | AGE | RAVLT_perc_forgetting | ADAS13 | FAQ | TRABSCOR | APOE4 | ICV | Ventricles | Entorhinal | Fusiform | MidTemp | Hippocampus | FDG | EcogPtTotal | EcogPtPlan | EcogPtVisspat | EcogPtLang | EcogPtMem | EcogSPLang | EcogPtDivatt | EcogSPTotal | EcogSPMem | EcogSPPlan | EcogPtOrgan | MOCA | EcogSPVisspat | EcogSPDivatt | EcogSPOrgan | TAU | PTAU | AV45 | ABETA
2  | RAVLT_immediate | RAVLT_learning | ADAS11 | RAVLT_forgetting | AGE | RAVLT_perc_forgetting | ADAS13 | FAQ | TRABSCOR | APOE4 | ICV | Ventricles | Entorhinal | Fusiform | MidTemp | Hippocampus | FDG | EcogPtTotal | EcogPtPlan | EcogPtVisspat | EcogPtLang | EcogPtMem | EcogSPLang | EcogPtDivatt | EcogSPTotal | EcogSPMem | EcogSPPlan | EcogPtOrgan | MOCA | EcogSPVisspat | EcogSPDivatt | EcogSPOrgan | TAU | PTAU | AV45 | ABETA
3  | RAVLT_immediate | RAVLT_learning | ADAS11 | RAVLT_forgetting | AGE | RAVLT_perc_fo

[32m2024-05-31 13:34:55.970[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_pure_adni[0m:[36m53[0m - [1mData imputation complete[0m
[32m2024-05-31 13:34:55.971[0m | [1mINFO    [0m | [36m__main__[0m:[36mimpute_pure_adni[0m:[36m54[0m - [1mStoring files...[0m
[32m2024-05-31 13:34:56.039[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 0[0m
[32m2024-05-31 13:34:56.099[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 1[0m
[32m2024-05-31 13:34:56.163[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 2[0m
[32m2024-05-31 13:34:56.233[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 3[0m
[32m2024-05-31 13:34:56.302[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 4[0

# Impute ADNI3

In [2]:
# Load the ADNI3 table and show which columns it carries.
adni3 = get_adni3(verbose=True)
print(adni3.columns)

# Build the event table and discard the three columns that are not kept.
adni3_sa = get_events(adni3).drop(columns=['REMOVE', 'RID', 'VISCODE'])
print(adni3_sa['Event'].value_counts())

# Run settings for this dataset.
config = dict(
    num_datasets=25,
    num_iterations=20,
    num_threads=14,
    random_state=1991,
    train_test_split=0.2,
    quality=False,
    save_all_iterations=True,
    set_mean_match_candidates=5,
    device='cpu',
    directory_name='ADNI3_BL',
)

# Row-wise split, stratified on the Event column.
df_train, df_test = train_test_split(
    adni3_sa,
    test_size=config['train_test_split'],
    random_state=42,
    stratify=adni3_sa['Event'],
)

print(df_train.shape, df_test.shape)

  df = pd.read_csv(datapath)
[32m2024-06-01 19:36:57.798[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m145[0m - [1mMissing value feature cutoff: 99.0 %[0m
[32m2024-06-01 19:36:57.798[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m146[0m - [1mRemoving columns with cutoff: Index(['PIB', 'ABETA', 'TAU', 'PTAU', 'DIGITSCOR', 'FLDSTRENG'], dtype='object')[0m
[32m2024-06-01 19:36:57.798[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m166[0m - [1mCategorical columns: Index(['COLPROT', 'ORIGPROT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'FSVERSION'], dtype='object')[0m
[32m2024-06-01 19:36:57.823[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m180[0m - [1mData shape = (3614, 56)[0m


ADNI3 shape before reducing: (3619, 59)
253 users selected, of which 44 have events
209 non-event subjects with MCI, of which 106 had tau meausrement
ADNI3 shape after reducing:  (841, 60)
Index(['RID', 'SITE', 'VISCODE', 'AGE', 'PTEDUCAT', 'APOE4', 'FDG', 'AV45',
       'FBB', 'CDRSB', 'ADAS11', 'ADAS13', 'ADASQ4', 'MMSE', 'RAVLT_immediate',
       'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting',
       'LDELTOTAL', 'TRABSCOR', 'FAQ', 'MOCA', 'EcogPtMem', 'EcogPtLang',
       'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan', 'EcogPtDivatt',
       'EcogPtTotal', 'EcogSPMem', 'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan',
       'EcogSPOrgan', 'EcogSPDivatt', 'EcogSPTotal', 'Ventricles',
       'Hippocampus', 'Entorhinal', 'Fusiform', 'MidTemp', 'ICV', 'DX',
       'mPACCdigit', 'mPACCtrailsB', 'M', 'PTMARRY_Never_married',
       'PTMARRY_married', 'ORIGPROT_ADNI2', 'ORIGPROT_ADNI3',
       'ORIGPROT_ADNIGO', 'PTGENDER_Male', 'PTETHCAT_Not Hisp/Latino',
       'PTETHCAT_Unknow

In [3]:
# Impute the ADNI3 partitions; impute_data also returns the output folder.
train_list, test_list, folder_name = impute_data(df_train, df_test, config, save=True)

# Keep the un-imputed partitions alongside the imputed datasets.
train_path = f"{folder_name}original_train.csv"
test_path = f"{folder_name}original_test.csv"
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

# Columns stripped from every imputed dataset before it is stored.
columns_rm = ['DX', 'ORIGPROT_ADNI2', 'ORIGPROT_ADNI3', 'ORIGPROT_ADNIGO', 'SITE']

dfs_train, dfs_test = [], []
for df_train_imp, df_test_imp in zip(train_list, test_list):
    # The drop is done in place, so the frames held in train_list/test_list
    # lose these columns as well.
    for frame in (df_train_imp, df_test_imp):
        frame.drop(columns=columns_rm, inplace=True)

    dfs_train.append(df_train_imp)
    dfs_test.append(df_test_imp)

store_csv(train_list=dfs_train, test_list=dfs_test, folder_name=folder_name, config=config)

Initialized logger with name mice 1-20
Dataset 0
1  | EcogPtMem | ADAS11 | ADAS13 | ADASQ4 | RAVLT_forgetting | EcogSPTotal | EcogSPLang | EcogPtDivatt | TRABSCOR | RAVLT_perc_forgetting | CDRSB | EcogPtOrgan | EcogSPMem | MMSE | AGE | EcogSPPlan | EcogSPDivatt | MOCA | EcogSPVisspat | FAQ | EcogSPOrgan | ICV | APOE4 | Ventricles | Fusiform | Entorhinal | Hippocampus | MidTemp | FDG | AV45 | ENTORHINAL_SUVR | INFERIOR_TEMPORAL_SUVR | TAU_METAROI | FBB
2  | EcogPtMem | ADAS11 | ADAS13 | ADASQ4 | RAVLT_forgetting | EcogSPTotal | EcogSPLang | EcogPtDivatt | TRABSCOR | RAVLT_perc_forgetting | CDRSB | EcogPtOrgan | EcogSPMem | MMSE | AGE | EcogSPPlan | EcogSPDivatt | MOCA | EcogSPVisspat | FAQ | EcogSPOrgan | ICV | APOE4 | Ventricles | Fusiform | Entorhinal | Hippocampus | MidTemp | FDG | AV45 | ENTORHINAL_SUVR | INFERIOR_TEMPORAL_SUVR | TAU_METAROI | FBB
3  | EcogPtMem | ADAS11 | ADAS13 | ADASQ4 | RAVLT_forgetting | EcogSPTotal | EcogSPLang | EcogPtDivatt | TRABSCOR | RAVLT_perc_forgetting

[32m2024-06-01 19:57:36.802[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 0[0m
[32m2024-06-01 19:57:36.811[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 1[0m
[32m2024-06-01 19:57:36.826[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 2[0m
[32m2024-06-01 19:57:36.842[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 3[0m
[32m2024-06-01 19:57:36.858[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 4[0m
[32m2024-06-01 19:57:36.875[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 5[0m
[32m2024-06-01 19:57:36.887[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring datase

# Impute MVAS+ADNI-A

In [15]:
# Fetch the combined table plus the two extra tables.
merge, extra_mci, extra_B = combine_mvas_adni(verbose=False)

print("Values used for imputation:")
print(*(frame.shape for frame in (merge, extra_mci, extra_B)))
print(merge['Event'].value_counts())

# Run settings for the MVAS+ADNI-A dataset.
config = dict(
    num_datasets=25,
    num_iterations=20,
    num_threads=14,
    random_state=1991,
    train_test_split=0.2,
    quality=False,
    save_all_iterations=True,
    set_mean_match_candidates=5,
    device='cpu',
    directory_name='MVAS+ADNI-A',
)

  df = pd.read_csv(datapath)
[32m2024-06-01 23:16:37.677[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m145[0m - [1mMissing value feature cutoff: 99.0 %[0m
[32m2024-06-01 23:16:37.677[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m146[0m - [1mRemoving columns with cutoff: Index(['PIB', 'ABETA', 'TAU', 'PTAU', 'DIGITSCOR', 'FLDSTRENG'], dtype='object')[0m
[32m2024-06-01 23:16:37.692[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m166[0m - [1mCategorical columns: Index(['COLPROT', 'ORIGPROT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'FSVERSION'], dtype='object')[0m
[32m2024-06-01 23:16:37.724[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m180[0m - [1mData shape = (3614, 56)[0m


Values used for imputation:
(291, 107) (38, 106) (113, 106)
Event
False    235
True      56
Name: count, dtype: int64


In [16]:
# Give both extra tables an Event column filled with 0.
for extra in (extra_mci, extra_B):
    extra['Event'] = 0

# Row-wise split of the combined table, stratified on the Event column.
df_train, df_test = train_test_split(
    merge,
    test_size=config['train_test_split'],
    random_state=42,
    stratify=merge['Event'],
)

# Only extra_mci rows whose RID already occurs in the training partition
# are appended to it.
rid_in_train = extra_mci['RID'].isin(df_train['RID'])
extra_mci_train = extra_mci[rid_in_train]
df_train = pd.concat([df_train, extra_mci_train], ignore_index=True)

# The appended rows carry 0 as Event; cast the whole column to bool.
df_train['Event'] = df_train['Event'].astype(bool)

print(df_train.shape, df_test.shape)

(262, 107) (59, 107)


In [17]:
# Impute the MVAS+ADNI-A partitions; impute_data also returns the output folder.
train_list, test_list, folder_name = impute_data(df_train, df_test, config, save=True)

# Keep the un-imputed partitions alongside the imputed datasets.
train_path = f"{folder_name}original_train.csv"
test_path = f"{folder_name}original_test.csv"
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

dfs_train, dfs_test = [], []
for df_train_imp, df_test_imp in zip(train_list, test_list):
    # Post-process each imputed pair, then round both results to 5 decimals.
    processed = post_imputation_processing(df_train_imp, df_test_imp)
    df_tr, df_te = (np.round(part, 5) for part in processed)

    dfs_train.append(df_tr)
    dfs_test.append(df_te)

store_csv(train_list=dfs_train, test_list=dfs_test, folder_name=folder_name, config=config)

Initialized logger with name mice 1-20
Dataset 0
1  | MMSE | CDRSB | AGE | MOCA | APOE4 | RAVLT_immediate | LDELTOTAL | ADAS13 | ADASQ4 | ADAS11 | FAQ | TAU_METAROI | INFERIOR_TEMPORAL_SUVR | ENTORHINAL_SUVR | Volume_mm3_parietal_gm | PiB_SUVR_parietal_wm | Volume_mm3_parietal_wm | PiB_SUVR_putamen | Volume_mm3_putamen | Volume_mm3_temporal_wm | Volume_mm3_temporal_gm | PiB_SUVR_temporal_wm | PiB_SUVR_thalamus | Volume_mm3_thalamus | PiB_SUVR_parietal_gm | PiB_SUVR_temporal_gm | Volume_mm3_occipital_wm | PiB_SUVR_caudate | PiB_SUVR_frontal_gm | PiB_SUVR_hippocampus | PiB_SUVR_frontal_wm | Volume_mm3_frontal_gm | PiB_SUVR_occipital_wm | Volume_mm3_occipital_gm | PiB_SUVR_occipital_gm | Volume_mm3_caudate | Volume_mm3_frontal_wm | Volume_mm3_hippocampus | SEPWI_CBF_hippocampus | SEPWI_CBV_occipital_wm | SEPWI_CTH_thalamus | SEPWI_CMRO2_thalamus | SEPWI_CBV_thalamus | SEPWI_CBF_thalamus | SEPWI_CBV_hippocampus | SEPWI_CMRO2_hippocampus | SEPWI_OEF_thalamus | SEPWI_OEF_frontal_wm | SEPWI_C

[32m2024-06-02 00:14:06.382[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 0[0m
[32m2024-06-02 00:14:06.413[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 1[0m
[32m2024-06-02 00:14:06.435[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 2[0m
[32m2024-06-02 00:14:06.460[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 3[0m
[32m2024-06-02 00:14:06.484[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 4[0m
[32m2024-06-02 00:14:06.504[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 5[0m
[32m2024-06-02 00:14:06.518[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring datase

# Impute MVAS+ADNI-B

In [18]:
# Fetch the combined table plus the two extra tables.
merge, extra_mci, extra_B = combine_mvas_adni(verbose=False)

print("Values used for imputation:")
print(*(frame.shape for frame in (merge, extra_mci, extra_B)))
print(merge['Event'].value_counts())

# Run settings for the MVAS+ADNI-B dataset.
config = dict(
    num_datasets=25,
    num_iterations=20,
    num_threads=14,
    random_state=1991,
    train_test_split=0.2,
    quality=False,
    save_all_iterations=True,
    set_mean_match_candidates=5,
    device='cpu',
    directory_name='MVAS+ADNI-B',
)

  df = pd.read_csv(datapath)
[32m2024-06-02 00:14:08.381[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m145[0m - [1mMissing value feature cutoff: 99.0 %[0m
[32m2024-06-02 00:14:08.381[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m146[0m - [1mRemoving columns with cutoff: Index(['PIB', 'ABETA', 'TAU', 'PTAU', 'DIGITSCOR', 'FLDSTRENG'], dtype='object')[0m
[32m2024-06-02 00:14:08.416[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m166[0m - [1mCategorical columns: Index(['COLPROT', 'ORIGPROT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'FSVERSION'], dtype='object')[0m
[32m2024-06-02 00:14:08.437[0m | [1mINFO    [0m | [36mUtils.Get_adni[0m:[36mget_merge[0m:[36m180[0m - [1mData shape = (3614, 56)[0m


Values used for imputation:
(291, 107) (38, 106) (113, 106)
Event
False    235
True      56
Name: count, dtype: int64


In [19]:
# Give both extra tables an Event column filled with 0.
for extra in (extra_mci, extra_B):
    extra['Event'] = 0

# Row-wise split of the combined table, stratified on the Event column.
df_train, df_test = train_test_split(
    merge,
    test_size=config['train_test_split'],
    random_state=42,
    stratify=merge['Event'],
)

# Only extra_mci rows whose RID already occurs in the training partition
# are appended to it.
rid_in_train = extra_mci['RID'].isin(df_train['RID'])
extra_mci_train = extra_mci[rid_in_train]
df_train = pd.concat([df_train, extra_mci_train], ignore_index=True)

# MVAS-ADNI-B: every extra_B row is appended to the training partition.
# NOTE(review): unlike extra_mci, extra_B is not filtered by RID here;
# confirm it cannot share subjects with df_test.
df_train = pd.concat([df_train, extra_B], ignore_index=True)

# The appended rows carry 0 as Event; cast the whole column to bool.
df_train['Event'] = df_train['Event'].astype(bool)

print(df_train.shape, df_test.shape)

(375, 107) (59, 107)


In [20]:
# Impute the MVAS+ADNI-B partitions; impute_data also returns the output folder.
train_list, test_list, folder_name = impute_data(df_train, df_test, config, save=True)

# Keep the un-imputed partitions alongside the imputed datasets.
train_path = f"{folder_name}original_train.csv"
test_path = f"{folder_name}original_test.csv"
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

dfs_train, dfs_test = [], []
for df_train_imp, df_test_imp in zip(train_list, test_list):
    # Post-process each imputed pair, then round both results to 5 decimals.
    processed = post_imputation_processing(df_train_imp, df_test_imp)
    df_tr, df_te = (np.round(part, 5) for part in processed)

    dfs_train.append(df_tr)
    dfs_test.append(df_te)

store_csv(train_list=dfs_train, test_list=dfs_test, folder_name=folder_name, config=config)

Initialized logger with name mice 1-20
Dataset 0
1  | AGE | MMSE | APOE4 | PTEDUCAT | CDRSB | MOCA | LDELTOTAL | RAVLT_immediate | ADASQ4 | ADAS13 | ADAS11 | FAQ | Volume_mm3_temporal_gm | Volume_mm3_putamen | Volume_mm3_parietal_gm | Volume_mm3_parietal_wm | Volume_mm3_temporal_wm | Volume_mm3_occipital_gm | Volume_mm3_hippocampus | Volume_mm3_frontal_wm | Volume_mm3_thalamus | Volume_mm3_frontal_gm | Volume_mm3_caudate | Volume_mm3_occipital_wm | ENTORHINAL_SUVR | INFERIOR_TEMPORAL_SUVR | TAU_METAROI | PiB_SUVR_temporal_wm | PiB_SUVR_temporal_gm | PiB_SUVR_putamen | PiB_SUVR_parietal_gm | PiB_SUVR_parietal_wm | PiB_SUVR_thalamus | PiB_SUVR_caudate | PiB_SUVR_hippocampus | PiB_SUVR_occipital_wm | PiB_SUVR_frontal_wm | PiB_SUVR_frontal_gm | PiB_SUVR_occipital_gm | SEPWI_CTH_occipital_wm | SEPWI_CBF_hippocampus | SEPWI_CBV_hippocampus | SEPWI_CMRO2_hippocampus | SEPWI_CTH_hippocampus | SEPWI_CTH_thalamus | SEPWI_CMRO2_thalamus | SEPWI_CBV_thalamus | SEPWI_CBF_thalamus | SEPWI_OEF_hippoc

[32m2024-06-02 01:39:56.768[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 0[0m
[32m2024-06-02 01:39:56.789[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 1[0m
[32m2024-06-02 01:39:56.813[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 2[0m
[32m2024-06-02 01:39:56.839[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 3[0m
[32m2024-06-02 01:39:56.865[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 4[0m
[32m2024-06-02 01:39:56.890[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring dataset 5[0m
[32m2024-06-02 01:39:56.919[0m | [1mINFO    [0m | [36mUtils.Data_imputation[0m:[36mstore_csv[0m:[36m70[0m - [1mStoring datase