In [1]:
import h5py
import numpy as np
import pandas as pd
import pickle
import os
import tempfile
from meld_classifier.meld_cohort import MeldCohort, MeldSubject
from meld_classifier.paths import BASE_PATH

Setting MELD_DATA_PATH to /home/lennartw/wdir/other_projects/meld_classifier_orig/meld_classifier_v1.1.0/data
Setting BASE_PATH to /home/lennartw/wdir/other_projects/meld_classifier_orig/meld_classifier_v1.1.0/data/output/preprocessed_surf_data
Setting EXPERIMENT_PATH to /home/lennartw/wdir/other_projects/meld_classifier_orig/meld_classifier_v1.1.0/data/models
Setting FS_SUBJECTS_PATH to /home/lennartw/wdir/other_projects/meld_classifier_orig/meld_classifier_v1.1.0/data/fs_outputs
Setting BASE_PATH to /home/lennartw/wdir/other_projects/meld_classifier_orig/meld_classifier_v1.1.0/data/meld_params


In [2]:
def create_dataset_file(subjects, output_path):
    """Write a dataset CSV that assigns every given subject to the 'test' split.

    Parameters
    ----------
    subjects : iterable of str
        Subject ids to list. The iterable is consumed exactly once, so
        one-shot iterables such as generators work as well as lists.
    output_path : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    The resulting CSV has the columns ``subject_id`` and ``split`` plus the
    default integer index column written by ``to_csv``.
    """
    # Materialise once so that both columns are built from the same ids,
    # even when `subjects` can only be iterated a single time.
    subject_ids = list(subjects)
    df = pd.DataFrame({
        'subject_id': subject_ids,
        'split': ['test'] * len(subject_ids),
    })
    df.to_csv(output_path)

### Create the subdataset 

In [3]:
# `dataset` stays None unless the optional block below is enabled.
dataset = None

# Optional: restrict the run to a hand-picked list of subjects.
# Uncomment the lines below to write those ids to a temporary csv file
# and point `dataset` at that file.
# subject_ids = [
#     'MELD_H101_3T_FCD_00120',
#     'MELD_H101_3T_FCD_00112',
# ]
# tmp = tempfile.NamedTemporaryFile(mode="w")
# create_dataset_file(subject_ids, tmp.name)
# dataset = tmp.name

# Build the cohort and collect the subject ids for the selected site code(s).
site_codes = ['H999']
cohort = MeldCohort(
    hdf5_file_root='{site_code}_{group}_featurematrix_smoothed.hdf5',
    dataset=dataset,
    data_dir=BASE_PATH,
)
listids = cohort.get_subject_ids(
    site_codes=site_codes,
    group='both',
    lesional_only=False,
)


In [4]:
# Display the subject ids returned by cohort.get_subject_ids.
listids

['MELD_H999_3T_FCD_00001',
 'MELD_H999_3T_FCD_00002',
 'MELD_H999_3T_FCD_00003',
 'MELD_H999_3T_FCD_00004',
 'MELD_H999_3T_FCD_00005',
 'MELD_H999_3T_FCD_00006',
 'MELD_H999_3T_FCD_00007',
 'MELD_H999_3T_FCD_00008',
 'MELD_H999_3T_FCD_00009',
 'MELD_H999_3T_FCD_00010',
 'MELD_H999_3T_FCD_00011',
 'MELD_H999_3T_FCD_00012',
 'MELD_H999_3T_FCD_00013',
 'MELD_H999_3T_FCD_00014',
 'MELD_H999_3T_FCD_00015',
 'MELD_H999_3T_FCD_00016',
 'MELD_H999_3T_FCD_00017',
 'MELD_H999_3T_FCD_00018',
 'MELD_H999_3T_FCD_00019',
 'MELD_H999_3T_FCD_00020',
 'MELD_H999_3T_FCD_00021',
 'MELD_H999_3T_FCD_00022',
 'MELD_H999_3T_FCD_00023',
 'MELD_H999_3T_FCD_00024',
 'MELD_H999_3T_FCD_00025',
 'MELD_H999_3T_FCD_00026',
 'MELD_H999_3T_FCD_00027',
 'MELD_H999_3T_FCD_00028',
 'MELD_H999_3T_FCD_00029',
 'MELD_H999_3T_FCD_00030',
 'MELD_H999_3T_FCD_00031',
 'MELD_H999_3T_FCD_00032',
 'MELD_H999_3T_FCD_00033',
 'MELD_H999_3T_FCD_00034',
 'MELD_H999_3T_FCD_00035',
 'MELD_H999_3T_FCD_00036',
 'MELD_H999_3T_FCD_00037',
 

### Extract features and save them to pickle files

In [6]:
# Directory the pickle files are written to (edit to your own location).
output_dir='./data/output/temporary_files'
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# Features to export; one pair of pickle files is written per entry.
features= [
           '.on_lh.curv.sm5.mgh']

# Filename template for the feature matrix (the file meant to be shared);
# '{}' is filled with the feature name.
npy_file_feature=os.path.join(output_dir, 'precombat_data_{}.pkl')
# Filename template for the matching list of subject ids (kept locally, to
# track each subject's row position in the feature matrix).
npy_file_ids=os.path.join(output_dir, 'listids_data_{}.pkl')

# Not used further in this cell; kept in case other cells reference it.
data_to_save={}
for feature_name in features:
    subjects_included=[]
    precombat_features = []
    # Collect the feature values of every subject that has this feature;
    # subjects without it are skipped silently.
    for subject in listids:
        subj = MeldSubject(subject, cohort=cohort)
        if subj.has_features(feature_name):
            # The same feature name is used for both hemispheres; `hemi`
            # selects the side. Values are indexed with cohort.cortex_mask
            # (presumably restricting to cortical vertices -- TODO confirm).
            lh = subj.load_feature_values(feature_name, hemi="lh")[cohort.cortex_mask]
            rh = subj.load_feature_values(feature_name, hemi="rh")[cohort.cortex_mask]
            # One row per subject: left-hemisphere values followed by right.
            combined_hemis = np.hstack([lh, rh])
            precombat_features.append(combined_hemis)
            subjects_included.append(subject)
    # Row i of the feature array corresponds to subjects_included[i].
    precombat_features = np.array(precombat_features)
    subjects_included = np.array(subjects_included)
    with open(npy_file_feature.format(feature_name), 'wb') as f:
        pickle.dump(precombat_features, f, pickle.HIGHEST_PROTOCOL)
    with open(npy_file_ids.format(feature_name), 'wb') as f:
        pickle.dump(subjects_included, f, pickle.HIGHEST_PROTOCOL)

In [7]:
# Reload the saved subject-id list for one feature to inspect it.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
ids_pkl_path = './data/output/temporary_files/listids_data_.on_lh.pial.K_filtered.sm20.mgh.pkl'
with open(ids_pkl_path, 'rb') as f:
    data = pickle.load(f)
data

array(['MELD_H999_3T_FCD_00001', 'MELD_H999_3T_FCD_00002',
       'MELD_H999_3T_FCD_00003', 'MELD_H999_3T_FCD_00004',
       'MELD_H999_3T_FCD_00005', 'MELD_H999_3T_FCD_00006',
       'MELD_H999_3T_FCD_00007', 'MELD_H999_3T_FCD_00008',
       'MELD_H999_3T_FCD_00009', 'MELD_H999_3T_FCD_00010',
       'MELD_H999_3T_FCD_00011', 'MELD_H999_3T_FCD_00012',
       'MELD_H999_3T_FCD_00013', 'MELD_H999_3T_FCD_00014',
       'MELD_H999_3T_FCD_00015', 'MELD_H999_3T_FCD_00016',
       'MELD_H999_3T_FCD_00017', 'MELD_H999_3T_FCD_00018',
       'MELD_H999_3T_FCD_00019', 'MELD_H999_3T_FCD_00020',
       'MELD_H999_3T_FCD_00021', 'MELD_H999_3T_FCD_00022',
       'MELD_H999_3T_FCD_00023', 'MELD_H999_3T_FCD_00024',
       'MELD_H999_3T_FCD_00025', 'MELD_H999_3T_FCD_00026',
       'MELD_H999_3T_FCD_00027', 'MELD_H999_3T_FCD_00028',
       'MELD_H999_3T_FCD_00029', 'MELD_H999_3T_FCD_00030',
       'MELD_H999_3T_FCD_00031', 'MELD_H999_3T_FCD_00032',
       'MELD_H999_3T_FCD_00033', 'MELD_H999_3T_FCD_00034

In [6]:
# Reload the saved subject-id list for another feature to inspect it.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
ids_pkl_path = './data/output/temporary_files/listids_data_.on_lh.wm_FLAIR_1.sm10.mgh.pkl'
with open(ids_pkl_path, 'rb') as f:
    data = pickle.load(f)
data

array(['MELD_H999_3T_FCD_00001', 'MELD_H999_3T_FCD_00002',
       'MELD_H999_3T_FCD_00003', 'MELD_H999_3T_FCD_00004',
       'MELD_H999_3T_FCD_00005', 'MELD_H999_3T_FCD_00006',
       'MELD_H999_3T_FCD_00007', 'MELD_H999_3T_FCD_00008',
       'MELD_H999_3T_FCD_00009', 'MELD_H999_3T_FCD_00010',
       'MELD_H999_3T_FCD_00011', 'MELD_H999_3T_FCD_00012',
       'MELD_H999_3T_FCD_00013', 'MELD_H999_3T_FCD_00014',
       'MELD_H999_3T_FCD_00015', 'MELD_H999_3T_FCD_00016',
       'MELD_H999_3T_FCD_00017', 'MELD_H999_3T_FCD_00018',
       'MELD_H999_3T_FCD_00019', 'MELD_H999_3T_FCD_00020',
       'MELD_H999_3T_FCD_00021', 'MELD_H999_3T_FCD_00022',
       'MELD_H999_3T_FCD_00023', 'MELD_H999_3T_FCD_00024',
       'MELD_H999_3T_FCD_00025', 'MELD_H999_3T_FCD_00026',
       'MELD_H999_3T_FCD_00027', 'MELD_H999_3T_FCD_00028',
       'MELD_H999_3T_FCD_00029', 'MELD_H999_3T_FCD_00030',
       'MELD_H999_3T_FCD_00031', 'MELD_H999_3T_FCD_00032',
       'MELD_H999_3T_FCD_00033', 'MELD_H999_3T_FCD_00034

In [11]:
# Positions in `listids` of the subjects singled out for a closer look.
weird_idx = [66, 93, 110, 118, 119]
weird_subs = [listids[i] for i in weird_idx]
weird_subs

In [16]:
# Read the list of subject ids; header=None treats the first line as data, so the columns are numbered from 0.
# NOTE(review): absolute, machine-specific path -- this cell will not run elsewhere without editing it.
data_subs = pd.read_csv('/media/lennartw/lablab-disk/lennart_data/data_H998/list_subject_ids_all.csv', header=None)

In [14]:
# Same subject ids as weird_subs, with the site code 'H999' replaced by 'H998'.
weird_subs_h998 = [sub_id.replace('H999','H998') for sub_id in weird_subs]

In [26]:
# For each H998 id, check whether it occurs in column 0 of data_subs.
[sub_id in data_subs[0].values for sub_id in weird_subs_h998]

[False, False, False, False, True]

In [25]:
# Spot-check a single id against column 0 of data_subs.
'MELD_H998_3T_FCD_00043' in data_subs[0].values

True

In [27]:
# Display the H998 ids derived from weird_subs.
weird_subs_h998

['MELD_H998_3T_FCD_00067',
 'MELD_H998_3T_FCD_00094',
 'MELD_H998_3T_FCD_00111',
 'MELD_H998_3T_FCD_00119',
 'MELD_H998_3T_FCD_00120']