## Sorting data to cohorts
In this file I sort the data based on the MRIT1_elim.csv. I will create 4 cohorts:  <br>
cohort name:         (UDSD, ALZD)<br>
normal cognition:    (1, 8)<br>
alzheimer's disease: (4, 1)<br>
mci on way to alzd:  (3, 1)<br>
transition cohort:   every transition, for example (3, 1) ---> (4, 1)<br>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Relative locations used by the notebook.
loadPath = '../data/'                            # input CSV (MRIT1_elim.csv) is read from here
savePath = '../results/'                         # not referenced in the cells visible here
writePath = '../../NACC_data/sorted_cohorts/'    # per-cohort CSV files live here

Reading the .csv file.

In [3]:
# Load the visit-level table that all cohort assignments are derived from.
df = pd.read_csv(f"{loadPath}MRIT1_elim.csv")

In [4]:
# Number of distinct (non-null) NACCID values, i.e. patients, in the table.
print(df['NACCID'].nunique())

898


Creating 4 empty arrays (4 groups), that will store NACCIDS based on given criteria.

In [5]:
# One independent list of NACCIDs per cohort:
#   nc    - normal cognition
#   mci   - mci
#   alz   - alzheimer's disease
#   trans - transitional array
nc, mci, alz, trans = [], [], [], []

In [6]:
# Assign every patient to exactly one cohort, based on ALL of their visits.
#
# The lists are reset here so that re-running this cell starts from scratch
# instead of appending every NACCID a second time.
nc, mci, alz, trans = [], [], [], []

# A single groupby pass replaces re-filtering the whole frame per patient.
# sort=False keeps patients in order of first appearance, like .unique().
# Rows whose NACCID is missing are skipped (groupby drops null keys).
for naccid, visits in df.groupby('NACCID', sort=False):

    udsd = visits['NACCUDSD']
    alzd = visits['NACCALZD']

    # A patient belongs to a "stable" cohort only if every visit carries
    # the same (NACCUDSD, NACCALZD) pair; anything else is a transition.
    if (udsd == 1).all() and (alzd == 8).all():
        nc.append(naccid)       # normal cognition

    elif (udsd == 4).all() and (alzd == 1).all():
        alz.append(naccid)      # alzheimer's disease

    elif (udsd == 3).all() and (alzd == 1).all():
        mci.append(naccid)      # mci

    else:
        trans.append(naccid)    # mixed values across visits


IMPORTANT: I deleted 3 patients by hand in trans, since they were extreme cases WLOG.

In [7]:
# Total number of patients across the four cohorts, then the size of the
# Alzheimer's cohort on its own.
print(sum(map(len, (nc, mci, alz, trans))))
print(len(alz))

898
64


Filtering the data.

In [8]:
# Keep, for each cohort, only the rows whose NACCID was assigned to it.
df_nc, df_mci, df_alz, df_trans = (
    df.loc[df['NACCID'].isin(ids)] for ids in (nc, mci, alz, trans)
)

Sanity check.

In [9]:
# Sum of distinct patients over the four cohort tables.
print(sum(cohort['NACCID'].nunique()
          for cohort in (df_nc, df_mci, df_alz, df_trans)))

898


In [10]:
# One-off export of the four cohort tables into `writePath` (disabled).
# NOTE(review): presumably kept commented out so a re-run does not overwrite
# the CSVs; later cells read nc.csv / mci.csv / alzd.csv / trans.csv back from
# `writePath`, so these files must already exist — confirm before running.
# df_nc.to_csv(writePath + 'nc.csv', index=False)
# df_mci.to_csv(writePath + 'mci.csv', index=False)
# df_alz.to_csv(writePath + 'alzd.csv', index=False)
# df_trans.to_csv(writePath + 'trans.csv', index=False)


# print("Data writing to CSV complete!")

### Sorting to cohorts
I will read the .csv files created in this document and sort the patient files from the directory "within1yr" into 4 different cohorts.

In [28]:
import os
import shutil
import itertools

In [29]:
# Source folder holding the scans, and the root folder of the cohort sub-folders.
moveTo = '../../NACC_data/sorted_cohorts/'
moveFrom = '../../NACC_data/within1yr/nifti/'

Loading 4 categories.

In [30]:
# Read the four cohort tables back from `writePath`.
df_nc, df_mci, df_alz, df_trans = (
    pd.read_csv(writePath + csv_name)
    for csv_name in ('nc.csv', 'mci.csv', 'alzd.csv', 'trans.csv')
)

Getting the filenames stored in those 4 .csv files.

In [31]:
# Pull the NACCMRFI column (one file name per row) out of each cohort table.
nc_filenames, mci_filenames, alz_filenames, trans_filenames = (
    [*cohort['NACCMRFI']] for cohort in (df_nc, df_mci, df_alz, df_trans)
)

In [32]:
# Visual check of the raw NACCMRFI entries of the normal-cognition cohort.
# NOTE(review): this prints the whole list; a slice would keep the output short.
print(nc_filenames)

['mri3468.zip', 'mri8604.zip', 'NACC010716_13122110752436602730000015033012065479600000007.zip', 'NACC010716_13122110752436602730000016050312073838400000001.zip', 'NACC585174_13122110752436602730000015062212225349900000001.zip', 'NACC585174_13122110752436602730000016110813524956900000010.zip', 'NACC292215_13122110752436602730000015081912152981500000004.zip', 'NACC292215_13122110752436602730000016110111382417400000004.zip', 'NACC877188_13122110752436602730000015072412182163700000007.zip', 'NACC877188_13122110752436602730000016113014261728200000007.zip', 'NACC905491_13122110752436602730000015091112012170000000010.zip', 'NACC905491_13122110752436602730000017092515563434600001543.zip', 'NACC291711_13122110752436602730000015103011565935800000004.zip', 'NACC291711_13122110752436602730000017111020485761400002570.zip', 'NACC840331_13122110752436602730000015102912262044100000007.zip', 'NACC840331_13122110752436602730000017080311514709200000001.zip', 'NACC478776_131221107524366027300000160621120

Handling the naming convention.

In [33]:
# Rewrite every name from '<name>.zip' to '<name>ni' in all four lists.
nc_filenames, mci_filenames, alz_filenames, trans_filenames = (
    [name.replace('.zip', 'ni') for name in names]
    for names in (nc_filenames, mci_filenames, alz_filenames, trans_filenames)
)

In [34]:
# Visual check that the '.zip' part of each name was replaced by 'ni'.
print(nc_filenames)

['mri3468ni', 'mri8604ni', 'NACC010716_13122110752436602730000015033012065479600000007ni', 'NACC010716_13122110752436602730000016050312073838400000001ni', 'NACC585174_13122110752436602730000015062212225349900000001ni', 'NACC585174_13122110752436602730000016110813524956900000010ni', 'NACC292215_13122110752436602730000015081912152981500000004ni', 'NACC292215_13122110752436602730000016110111382417400000004ni', 'NACC877188_13122110752436602730000015072412182163700000007ni', 'NACC877188_13122110752436602730000016113014261728200000007ni', 'NACC905491_13122110752436602730000015091112012170000000010ni', 'NACC905491_13122110752436602730000017092515563434600001543ni', 'NACC291711_13122110752436602730000015103011565935800000004ni', 'NACC291711_13122110752436602730000017111020485761400002570ni', 'NACC840331_13122110752436602730000015102912262044100000007ni', 'NACC840331_13122110752436602730000017080311514709200000001ni', 'NACC478776_13122110752436602730000016062112051245000000016ni', 'NACC478776_1

In [35]:
# Number of file names per cohort, one per line (nc, mci, alz, trans).
print(*map(len, (nc_filenames, mci_filenames, alz_filenames, trans_filenames)),
      sep='\n')

1519
96
142
369


Moving the files to cohorts.

In [37]:
# Move every entry listed for a cohort from `moveFrom` into that cohort's
# sub-folder of `moveTo`.
cohort_targets = {
    'NC/': nc_filenames,
    'MCI/': mci_filenames,
    'ALZD/': alz_filenames,
    'TRANS/': trans_filenames,
}

# Sources that could not be found (e.g. already moved by an earlier run).
missing_sources = []

for subfolder, filenames in cohort_targets.items():
    target_dir = moveTo + subfolder

    # shutil.move only moves *into* the destination when it is an existing
    # directory; otherwise the source would be renamed to that path.
    os.makedirs(target_dir, exist_ok=True)

    for filename in filenames:
        source = moveFrom + filename

        # Skip instead of raising so the cell can be re-run safely.
        if not os.path.exists(source):
            missing_sources.append(source)
            continue

        shutil.move(source, target_dir)

if missing_sources:
    print(f"Skipped {len(missing_sources)} entries not found in {moveFrom}")

# SUCCESS :)

### Deleting the files (ran only once, hence the commented parts of code)
I will delete all the files in my folder "within1yr" that did not make it to the final selection.
<br>
First I need to add "ni" to the filenames in the lists, since that is the naming convention in the folder.

In [None]:
# Folder with the zipped scans and the folder with their extracted counterparts.
dirPath_extract = "../../NACC_data/within1yr/nifti_extract"
dirPath = "../../NACC_data/within1yr/nifti"

In [221]:
# Add "ni" before ".zip" for each element
# Build the archive names ('<name>ni.zip') straight from the cohort tables
# instead of from the *_filenames lists. Those lists are rewritten by other
# cells, so the result would depend on execution order, and applying this
# replacement twice would produce '<name>nini.zip'.
nc_filenames = [file_name.replace('.zip', 'ni.zip') for file_name in df_nc['NACCMRFI']]
mci_filenames = [file_name.replace('.zip', 'ni.zip') for file_name in df_mci['NACCMRFI']]
alz_filenames = [file_name.replace('.zip', 'ni.zip') for file_name in df_alz['NACCMRFI']]
trans_filenames = [file_name.replace('.zip', 'ni.zip') for file_name in df_trans['NACCMRFI']]

In [222]:
# Visual check that the names now end in 'ni.zip'.
print(nc_filenames)

['mri3468ni.zip', 'mri8604ni.zip', 'NACC010716_13122110752436602730000015033012065479600000007ni.zip', 'NACC010716_13122110752436602730000016050312073838400000001ni.zip', 'NACC585174_13122110752436602730000015062212225349900000001ni.zip', 'NACC585174_13122110752436602730000016110813524956900000010ni.zip', 'NACC292215_13122110752436602730000015081912152981500000004ni.zip', 'NACC292215_13122110752436602730000016110111382417400000004ni.zip', 'NACC877188_13122110752436602730000015072412182163700000007ni.zip', 'NACC877188_13122110752436602730000016113014261728200000007ni.zip', 'NACC905491_13122110752436602730000015091112012170000000010ni.zip', 'NACC905491_13122110752436602730000017092515563434600001543ni.zip', 'NACC291711_13122110752436602730000015103011565935800000004ni.zip', 'NACC291711_13122110752436602730000017111020485761400002570ni.zip', 'NACC840331_13122110752436602730000015102912262044100000007ni.zip', 'NACC840331_13122110752436602730000017080311514709200000001ni.zip', 'NACC478776_1

Merging the lists together.

In [223]:
# Concatenate into a NEW list. Assigning `file_list = nc_filenames` and then
# extending it would also grow `nc_filenames` (same list object), and every
# re-run of the cell would append the other cohorts again.
file_list = nc_filenames + mci_filenames + alz_filenames + trans_filenames

In [224]:
# Show the merged list followed by its length.
print(file_list, len(file_list), sep='\n')

['mri3468ni.zip', 'mri8604ni.zip', 'NACC010716_13122110752436602730000015033012065479600000007ni.zip', 'NACC010716_13122110752436602730000016050312073838400000001ni.zip', 'NACC585174_13122110752436602730000015062212225349900000001ni.zip', 'NACC585174_13122110752436602730000016110813524956900000010ni.zip', 'NACC292215_13122110752436602730000015081912152981500000004ni.zip', 'NACC292215_13122110752436602730000016110111382417400000004ni.zip', 'NACC877188_13122110752436602730000015072412182163700000007ni.zip', 'NACC877188_13122110752436602730000016113014261728200000007ni.zip', 'NACC905491_13122110752436602730000015091112012170000000010ni.zip', 'NACC905491_13122110752436602730000017092515563434600001543ni.zip', 'NACC291711_13122110752436602730000015103011565935800000004ni.zip', 'NACC291711_13122110752436602730000017111020485761400002570ni.zip', 'NACC840331_13122110752436602730000015102912262044100000007ni.zip', 'NACC840331_13122110752436602730000017080311514709200000001ni.zip', 'NACC478776_1

In [225]:
# Get the list of entries in the specified folder.
# os.listdir returns bare names (no path prefix) in arbitrary order.
dirs_in_folder = os.listdir(dirPath)

In [226]:
# Visual check of what is currently stored in `dirPath`.
print(dirs_in_folder)

['mri129ni.zip', 'mri133ni.zip', 'mri136ni.zip', 'mri139ni.zip', 'mri140ni.zip', 'mri141ni.zip', 'mri143ni.zip', 'mri145ni.zip', 'mri147ni.zip', 'mri148ni.zip', 'mri149ni.zip', 'mri150ni.zip', 'mri151ni.zip', 'mri152ni.zip', 'mri154ni.zip', 'mri160ni.zip', 'mri167ni.zip', 'mri175ni.zip', 'mri178ni.zip', 'mri181ni.zip', 'mri1832ni.zip', 'mri1833ni.zip', 'mri1834ni.zip', 'mri1835ni.zip', 'mri1837ni.zip', 'mri1838ni.zip', 'mri1848ni.zip', 'mri1849ni.zip', 'mri1850ni.zip', 'mri1851ni.zip', 'mri1853ni.zip', 'mri1855ni.zip', 'mri1857ni.zip', 'mri1858ni.zip', 'mri1860ni.zip', 'mri1864ni.zip', 'mri1865ni.zip', 'mri1869ni.zip', 'mri1870ni.zip', 'mri187ni.zip', 'mri1900ni.zip', 'mri1901ni.zip', 'mri190ni.zip', 'mri1925ni.zip', 'mri192ni.zip', 'mri193ni.zip', 'mri1955ni.zip', 'mri1957ni.zip', 'mri1959ni.zip', 'mri1961ni.zip', 'mri196ni.zip', 'mri1971ni.zip', 'mri1972ni.zip', 'mri1973ni.zip', 'mri1974ni.zip', 'mri1975ni.zip', 'mri199ni.zip', 'mri201ni.zip', 'mri202ni.zip', 'mri203ni.zip', 'mri207n

In [227]:
# Names that are both present in `dirPath` and listed in one of the cohorts.
common_filenames = set(dirs_in_folder) & set(file_list)

print(len(common_filenames))

2112


In [228]:
# Destructive clean-up, kept commented out: removes every entry of `dirPath`
# whose name is not in `file_list`.
# NOTE(review): os.remove deletes files, not directories — the entries listed
# from `dirPath` appear to be .zip archives; confirm before re-enabling.
# # Loop through the directories in the folder
# for dir_name in dirs_in_folder:

#     # Check if the directory name is NOT in the file list 
#     if dir_name not in file_list:
#         dir_to_remove = os.path.join(dirPath, dir_name)
#         print(f"Deleting directory: {dir_to_remove}")
#         os.remove(dir_to_remove)

### Deleting by hand
There are only a few files left that I want to keep, so I can do that by hand.

In [229]:
# Names of the entries inside the extraction folder.
folders_list = os.listdir(dirPath_extract)

print(folders_list, len(folders_list), sep='\n')

['1018_NACC001341_20170307ni', '1018_NACC002424_20170509ni', '1018_NACC004873_20190826ni', '1018_NACC006454_20170727ni', '1018_NACC007445_20190430ni', '1018_NACC007737_20191014ni', '1018_NACC008987_20190211ni', '1018_NACC013585_20190311ni', '1018_NACC015412_20180315ni', '1018_NACC015790_20170223ni', '1018_NACC016272_20190301ni', '1018_NACC016727_20170814ni', '1018_NACC017548_20200114ni', '1018_NACC017560_20170516ni', '1018_NACC017560_20201001ni', '1018_NACC019767_20181113ni', '1018_NACC022572_20180522ni', '1018_NACC023248_20171215ni', '1018_NACC025694_20181025ni', '1018_NACC025902_20191021ni', '1018_NACC031396_20191204ni', '1018_NACC032047_20190610ni', '1018_NACC033103_20191001ni', '1018_NACC033752_20170515ni', '1018_NACC034369_20180604ni', '1018_NACC035668_20170929ni', '1018_NACC041825_20180802ni', '1018_NACC044374_20191030ni', '1018_NACC045315_20190204ni', '1018_NACC046449_20190703ni', '1018_NACC046497_20170220ni', '1018_NACC048048_20190406ni', '1018_NACC048134_20190819ni', '1018_NAC

In [230]:
# Drop '.zip' so the names can be compared with the extracted folder names.
dirs_in_folder = [entry.replace('.zip', '') for entry in dirs_in_folder]

print(dirs_in_folder, len(dirs_in_folder), sep='\n')

['mri129ni', 'mri133ni', 'mri136ni', 'mri139ni', 'mri140ni', 'mri141ni', 'mri143ni', 'mri145ni', 'mri147ni', 'mri148ni', 'mri149ni', 'mri150ni', 'mri151ni', 'mri152ni', 'mri154ni', 'mri160ni', 'mri167ni', 'mri175ni', 'mri178ni', 'mri181ni', 'mri1832ni', 'mri1833ni', 'mri1834ni', 'mri1835ni', 'mri1837ni', 'mri1838ni', 'mri1848ni', 'mri1849ni', 'mri1850ni', 'mri1851ni', 'mri1853ni', 'mri1855ni', 'mri1857ni', 'mri1858ni', 'mri1860ni', 'mri1864ni', 'mri1865ni', 'mri1869ni', 'mri1870ni', 'mri187ni', 'mri1900ni', 'mri1901ni', 'mri190ni', 'mri1925ni', 'mri192ni', 'mri193ni', 'mri1955ni', 'mri1957ni', 'mri1959ni', 'mri1961ni', 'mri196ni', 'mri1971ni', 'mri1972ni', 'mri1973ni', 'mri1974ni', 'mri1975ni', 'mri199ni', 'mri201ni', 'mri202ni', 'mri203ni', 'mri207ni', 'mri210ni', 'mri212ni', 'mri214ni', 'mri216ni', 'mri219ni', 'mri220ni', 'mri223ni', 'mri224ni', 'mri226ni', 'mri229ni', 'mri233ni', 'mri235ni', 'mri236ni', 'mri239ni', 'mri240ni', 'mri241ni', 'mri242ni', 'mri246ni', 'mri249ni', 'mri2502

In [231]:
# Names that exist both as an extracted folder and as a (stripped) archive name.
common_files = set(folders_list) & set(dirs_in_folder)

print(common_files, len(common_files), sep='\n')

set()
0


In [232]:
# Destructive clean-up, kept commented out: would recursively delete every
# entry of `common_files` from `dirPath_extract`.
# for i in common_files:
#     shutil.rmtree(dirPath_extract + '/' + i)

In [233]:
# Strip '.zip' from the merged cohort list as well.
file_list = [name.replace('.zip', '') for name in file_list]

In [234]:
# Visual check that the '.zip' suffix is gone from the merged list.
print(file_list)

['mri3468ni', 'mri8604ni', 'NACC010716_13122110752436602730000015033012065479600000007ni', 'NACC010716_13122110752436602730000016050312073838400000001ni', 'NACC585174_13122110752436602730000015062212225349900000001ni', 'NACC585174_13122110752436602730000016110813524956900000010ni', 'NACC292215_13122110752436602730000015081912152981500000004ni', 'NACC292215_13122110752436602730000016110111382417400000004ni', 'NACC877188_13122110752436602730000015072412182163700000007ni', 'NACC877188_13122110752436602730000016113014261728200000007ni', 'NACC905491_13122110752436602730000015091112012170000000010ni', 'NACC905491_13122110752436602730000017092515563434600001543ni', 'NACC291711_13122110752436602730000015103011565935800000004ni', 'NACC291711_13122110752436602730000017111020485761400002570ni', 'NACC840331_13122110752436602730000015102912262044100000007ni', 'NACC840331_13122110752436602730000017080311514709200000001ni', 'NACC478776_13122110752436602730000016062112051245000000016ni', 'NACC478776_1

In [235]:
# Cohort entries that are present in the extraction folder.
common_filenames = set(file_list) & set(folders_list)

MOVE THESE COMMON FILENAMES FROM nifti_extract to nifti. Now nifti is the FINAL SET OF DATA THAT HOLDS ALL PATIENTS STORED IN THOSE 4 .CSV FILES (4 COHORTS).

In [236]:
# Show the matching names followed by how many there are.
print(common_filenames, len(common_filenames), sep='\n')

{'1018_NACC282203_20201106ni', '1018_NACC711567_20201214ni', '1018_NACC822475_20201119ni', '1018_NACC862393_20191003ni', '1018_NACC822475_20171116ni', '1018_NACC838157_20200820ni', '1018_NACC356689_20171019ni', '1018_NACC282203_20170908ni', '1018_NACC711567_20200114ni', '1018_NACC450406_20210128ni', '1018_NACC356689_20201102ni', '1018_NACC450406_20180615ni', '1018_NACC862393_20161122ni', '1018_NACC838157_20170510ni'}
14
