## Adding sequencing type to .csv files
In this notebook I read all the .json files in my sorted cohorts, find out how many different types of FSPGR and MPRAGE sequences (protocol names) I have, and write the sequence type of each scan to the .csv files.

In [17]:
import pandas as pd
import os
import json

Defining the paths to the sorted cohort folders.

In [9]:
# Cohort folders, resolved to absolute paths from their location relative to
# the current working directory.
ncPath = os.path.abspath('../../NACC_data/sorted_cohorts/NC_simple/')
mciPath = os.path.abspath('../../NACC_data/sorted_cohorts/MCI_simple/')
alzdPath = os.path.abspath('../../NACC_data/sorted_cohorts/ALZD_simple/')
transPath = os.path.abspath('../../NACC_data/sorted_cohorts/TRANS_simple/')

# On Windows every path gets the '\\?\' prefix (extended-length path syntax).
if os.name == 'nt':
    ncPath, mciPath, alzdPath, transPath = (
        '\\\\?\\' + cohort_dir
        for cohort_dir in (ncPath, mciPath, alzdPath, transPath)
    )

Reading the folder names in each sorted cohort, reading the .json file inside each folder, and storing its protocol name in a dictionary keyed by folder name.

In [31]:
def store_to_dic(path):
    """Map every sub-folder of a cohort directory to its ProtocolName.

    Each sub-folder of ``path`` is searched for ``.json`` files, and the
    ``ProtocolName`` field of such a file is stored under the sub-folder's
    name.

    Parameters
    ----------
    path : str
        Directory whose sub-folders contain the ``.json`` files.

    Returns
    -------
    dict
        ``{folder name: protocol name}``. The protocol name is ``''`` when the
        JSON has no ``ProtocolName`` key. Sub-folders without any ``.json``
        file do not appear in the result.
    """
    dic = {}                      # folder name -> protocol name

    # Sorted so the result does not depend on the arbitrary order in which
    # os.listdir returns directory entries.
    for folder in sorted(os.listdir(path)):

        folder_fp = os.path.join(path, folder)

        # A plain file lying next to the sub-folders would make the inner
        # os.listdir raise NotADirectoryError; skip anything that is not a
        # directory.
        if not os.path.isdir(folder_fp):
            continue

        for files in sorted(os.listdir(folder_fp)):

            # extracting only .json files
            if not files.endswith('.json'):
                continue

            json_fp = os.path.join(folder_fp, files)

            # JSON is UTF-8 by specification; do not rely on the platform's
            # default encoding.
            with open(json_fp, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # If a folder holds several .json files, the last one in sorted
            # order wins.
            dic[folder] = str(json_data.get('ProtocolName', ""))

    return dic

In [32]:
# One {folder name: protocol name} dictionary per cohort.
nc_dic, mci_dic, alzd_dic, trans_dic = [
    store_to_dic(cohort_dir)
    for cohort_dir in (ncPath, mciPath, alzdPath, transPath)
]

Sanity check.

In [45]:
# Show the ALZD mapping followed by its number of entries, one per line.
print(alzd_dic, len(alzd_dic), sep='\n')

{'1018_NACC862393_20161122ni': 'MPRAGE_GRAPPA2', '1018_NACC862393_20191003ni': 'MPRAGE_GRAPPA2', 'mri5006ni': 'IR-FSPGR_TI=500_FA=10', 'mri5008ni': 'IR-FSPGR_TI=500_FA=10', 'mri5038ni': 'IR-FSPGR_TI=500_FA=10', 'mri5039ni': 'IR-FSPGR_TI=500_FA=10', 'mri5040ni': 'IR-FSPGR_TI=500_FA=10', 'mri5041ni': 'IR-FSPGR_TI=500_FA=10', 'mri5043ni': 'MP-RAGE', 'mri5044ni': 'MP-RAGE_REPEAT', 'mri5045ni': 'MP-RAGE', 'mri5046ni': 'MP-RAGE_REPEAT', 'mri5052ni': 'MP-RAGE', 'mri5053ni': 'MP-RAGE_REPEAT', 'mri5054ni': 'MP-RAGE', 'mri5055ni': 'MP-RAGE_REPEAT', 'mri5066ni': 'IR-FSPGR_TI=500_FA=10', 'mri5067ni': 'IR-FSPGR_TI=500_FA=10', 'mri5082ni': 'IR-FSPGR_TI=500_FA=10', 'mri5083ni': 'IR-FSPGR_TI=500_FA=10', 'mri5105ni': 'IR-FSPGR_TI=500_FA=10', 'mri5106ni': 'IR-FSPGR_TI=500_FA=10', 'mri5111ni': 'MP-RAGE', 'mri5116ni': 'IR-FSPGR_TI=500_FA=10', 'mri5170ni': 'IR-FSPGR_TI=500_FA=10', 'mri5171ni': 'IR-FSPGR_TI=500_FA=10', 'mri5197ni': 'IR-FSPGR_TI=500_FA=10', 'mri5198ni': 'IR-FSPGR_TI=500_FA=10', 'mri5199ni': 

Loading .csv files

In [57]:
# Input and output .csv files share one folder.
loadPath = savePath = '../../NACC_data/sorted_cohorts/0csv/'

In [58]:
# One table per cohort, read from the folder given by loadPath.
nc_df, mci_df, alzd_df, trans_df = [
    pd.read_csv(loadPath + csv_name)
    for csv_name in ('nc_v3.csv', 'mci_v3.csv', 'alzd_v3.csv', 'trans_v3.csv')
]

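Adding the sequence-type columns to the table; the result is written to a new .csv file further below. To process another cohort, replace the `nc` variables with, for example, the `mci` ones.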

In [71]:
def add_sequence_columns(df, protocol_by_scan):
    """Return a copy of ``df`` with PTCLNAME, MPRAGE and FSPGR columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``NACCMRFI`` column.
    protocol_by_scan : dict
        ``{NACCMRFI value: protocol name}``, as built by ``store_to_dic``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus three columns:
        ``PTCLNAME`` - the protocol name, ``''`` when NACCMRFI is not a key
        of ``protocol_by_scan``;
        ``MPRAGE`` / ``FSPGR`` - 1 when the protocol name denotes that
        sequence, else 0.
    """
    out = df.copy()

    # Rows whose NACCMRFI is missing from the dictionary map to NaN -> ''.
    ptclname = out['NACCMRFI'].map(protocol_by_scan).fillna('').astype(str)

    # Protocol names are spelled inconsistently ('MPRAGE_GRAPPA2', 'MP-RAGE',
    # 'MP-RAGE_REPEAT', ...). Upper-casing and dropping everything that is
    # not a letter or digit lets one substring test catch all spellings; a
    # literal "'MPRAGE' in name" check misses every 'MP-RAGE' scan.
    normalized = ptclname.str.upper().str.replace(r'[^A-Z0-9]', '', regex=True)

    out['PTCLNAME'] = ptclname
    out['MPRAGE'] = normalized.str.contains('MPRAGE', regex=False).astype('int64')
    out['FSPGR'] = normalized.str.contains('FSPGR', regex=False).astype('int64')
    return out


# Same call works for the other cohorts, e.g. add_sequence_columns(mci_df, mci_dic).
nc_df = add_sequence_columns(nc_df, nc_dic)

In [72]:
# Reorder the columns
# Keep exactly these columns, in this order; the three sequence columns go
# just before NACCMRFI.
new_column_order = [
    'NACCID', 'NACCMNUM',
    'MRIMO', 'MRIDY', 'MRIYR',
    'NACCMRIA', 'NACCVNUM',
    'VISITMO', 'VISITDAY', 'VISITYR',
    'NACCUDSD', 'NACCALZD',
    'PTCLNAME', 'MPRAGE', 'FSPGR',
    'NACCMRFI',
]

nc_df = nc_df.loc[:, new_column_order]

In [73]:
# NOTE(review): the save is commented out, although the recorded output of
# this cell shows it was executed at least once - presumably disabled
# afterwards to avoid overwriting nc_v4.csv. Uncomment both lines to write
# the file again.
#nc_df.to_csv(savePath + 'nc_v4.csv', index=False)
#print("Updated DataFrame saved to 'nc_v4.csv'")

Updated DataFrame saved to 'nc_v4.csv'
