In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Locations of the project scripts directory, the per-subject completeness
# table, and the subject list written out later in the notebook.
scriptsDir = '/projects/f_mc1689_1/ReliableFC/docs/scripts'
completenessFile = '/projects/f_mc1689_1/AgingHubs/docs/scripts/HCA_LS_2.0_subject_completeness.csv'
subjectsFile = f'{scriptsDir}/subjects.tsv'

# Subject IDs to drop by hand after the automatic completeness filters.
excludeSubjs = []

# Completeness column -> value a subject must have to be retained.
# Commented-out entries are optional criteria that are currently not enforced.
# NOTE(review): the required values are strings, so the comparisons only match
# if read_csv leaves these columns as text; if pandas infers a numeric dtype
# every row is silently filtered out -- confirm against the CSV.
requiredValues = {
    #'Full_MR_Compl': '0',
    'T1_Count': '1',
    'T2_Count': '1',
    'RS-fMRI_PctCompl': '100',
    'dMRI_Compl': '1',
    'tMRI_CARIT_PctCompl': '100',
    #'tMRI_FACENAME_PctCompl': '100',
    #'tMRI_VISMOTOR_PctCompl': '100',
}

completenessDF = pd.read_csv(completenessFile,sep=',',header=0,index_col=0)
for column, required in requiredValues.items():
    completenessDF = completenessDF[completenessDF[column] == required]
# Keep only subjects with no QC issue code recorded.
completenessDF = completenessDF[pd.isnull(completenessDF['QC_Issue_Codes'])]
completenessDF = completenessDF.drop(index=excludeSubjs)

# Each individual-difference measure has a completeness column named after its
# file name with the '.txt' suffix removed; require every one of them to be '1'.
measuresDF = pd.read_csv(f'{scriptsDir}/indivDiffMeasures.tsv',sep='\t',index_col='measures')
for measure in measuresDF['filename']:
    meas = measure.split('.txt')[0]
    completenessDF = completenessDF[completenessDF[meas] == '1']

# Take an explicit copy before adding a column: the frame above is the result
# of chained boolean filtering, and assigning into it directly triggers
# pandas' SettingWithCopyWarning.
completenessDF = completenessDF.copy()

# Convert interview_age to years by dividing by 12, rounded to 4 decimals.
ageMonths = completenessDF['interview_age'].values
ageYears = np.around([int(i)/12 for i in ageMonths],4)
completenessDF['ageYears'] = ageYears

# One row per retained subject; 'dataset' starts as an empty string.
subjectsDF = pd.DataFrame({'age': completenessDF['ageYears'], 'sex': completenessDF['sex'], 'dataset': np.full((len(completenessDF)),'')})
print(len(subjectsDF.index))
print(subjectsDF)


482
                     age sex dataset
src_subject_id                      
HCA6002236       46.5000   F        
HCA6018857       36.3333   F        
HCA6030645       45.3333   F        
HCA6047359       53.3333   M        
HCA6051047       60.4167   F        
...                  ...  ..     ...
HCA9938309      100.0000   F        
HCA9938814       46.9167   F        
HCA9947411       38.2500   M        
HCA9956008       41.0000   F        
HCA9992517       54.0000   F        

[482 rows x 3 columns]


In [11]:
def _subjectMeasureValue(values, subj, measure):
    """Reduce the rows found for one subject/measure to a single float.

    Parameters
    ----------
    values : numpy.ndarray
        The entries of the measure column for every row matching the subject.
    subj, measure
        Used only to build the error message.

    Returns
    -------
    float
        The subject's value, or NaN when no row (or no valid row) exists.

    Raises
    ------
    ValueError
        If several rows remain after discarding NaN and 999 entries, since
        there is then no single value to store for the subject.
    """
    if len(values) == 0:
        return np.nan
    if len(values) == 1:
        # Index the element explicitly: float() on a 1-element array is
        # deprecated since NumPy 1.25.
        # NOTE(review): a lone row equal to 999 is kept as-is here, whereas
        # 999 is discarded when a subject has several rows -- confirm whether
        # 999 should be treated as missing in this case too.
        return float(values[0])

    # Several rows for the same subject: keep only usable entries.
    valid = values.astype(float)
    valid = valid[~np.isnan(valid)]
    valid = valid[valid != 999]
    if len(valid) == 0:
        return np.nan
    if len(valid) > 1:
        raise ValueError(
            f'{len(valid)} valid values found for subject {subj} in measure '
            f'{measure}; expected at most one'
        )
    return float(valid[0])


indivDiffs = {}
subjList = subjectsDF.index

# Build one array per measure holding each subject's value (NaN if missing).
for measure in measuresDF.index:
    measureFile = f'/projects/f_mc1689_1/ReliableFC/data/downloads/{measuresDF.loc[measure,"filename"]}'
    col = measuresDF.loc[measure,"colname"]
    tmpDF = pd.read_csv(measureFile,sep='\t')
    scores = np.zeros((len(subjList)))
    for s,subj in enumerate(subjList):
        rows = tmpDF.loc[tmpDF["src_subject_id"]==subj,col].values
        scores[s] = _subjectMeasureValue(rows, subj, measure)
    indivDiffs[measure] = scores

indivDiffsDF = pd.DataFrame(data=indivDiffs,index=subjList)
# True for subjects that have a non-missing value for every measure.
subjsWithBeh = indivDiffsDF.notna().all(axis=1).values

# Explicit copy so that later column assignments on subjectsDF do not operate
# on a slice of the unfiltered frame.
subjectsDF = subjectsDF[subjsWithBeh].copy()
print(subjectsDF.shape)
subjectsDF.to_csv(path_or_buf=subjectsFile, sep='\t', index_label='subjects')


(472, 3)


In [13]:
# Alternate subjects between the two halves by row position:
# even positions -> discovery, odd positions -> replication.
rowParity = np.arange(len(subjectsDF)) % 2
subjectsDF.loc[rowParity == 0, 'dataset'] = 'discovery'
subjectsDF.loc[rowParity == 1, 'dataset'] = 'replication'

isDiscovery = subjectsDF['dataset'] == 'discovery'
isReplication = subjectsDF['dataset'] == 'replication'
discDF = subjectsDF.loc[isDiscovery]
repDF = subjectsDF.loc[isReplication]

subjectsFileDisc = f'{scriptsDir}/subjects_discovery.tsv'
subjectsFileRep = f'{scriptsDir}/subjects_replication.tsv'

# Write each half to its own tab-separated subject list.
for splitDF, splitFile in ((discDF, subjectsFileDisc), (repDF, subjectsFileRep)):
    splitDF.to_csv(path_or_buf=splitFile, sep='\t', index_label='subjects')

print(discDF)

# Per-half summary: number of female subjects, then mean and SD of age.
for heading, splitDF in (('Discovery:', discDF), ('Replication:', repDF)):
    splitAge = splitDF['age']
    print()
    print(heading)
    print(f"{np.sum(splitDF['sex'] == 'F')} female subjects")
    print(f"Mean age = {np.mean(splitAge):.2f} (SD = {np.std(splitAge):.2f})")


                    age sex    dataset
src_subject_id                        
HCA6002236      46.5000   F  discovery
HCA6030645      45.3333   F  discovery
HCA6051047      60.4167   F  discovery
HCA6054457      62.5000   M  discovery
HCA6075263      63.9167   M  discovery
...                 ...  ..        ...
HCA9912391      49.6667   M  discovery
HCA9913090      79.2500   M  discovery
HCA9926201      76.2500   M  discovery
HCA9938814      46.9167   F  discovery
HCA9956008      41.0000   F  discovery

[236 rows x 3 columns]

Discovery:
141 female subjects
Mean age = 56.89 (SD = 13.95)

Replication:
134 female subjects
Mean age = 58.23 (SD = 14.35)
