In this notebook we build the final data to be analysed. 

In [1]:
import numpy as np
import pandas as pd
from os.path import join as opj
from scipy.io import loadmat
import h5py

In [2]:
# Root directory under which the per-subject feature files and the local
# connectome .mat file are read, and where the final HDF5 file is written.
# NOTE(review): the value looks like a placeholder — set it to the real location before running.
data_dir = "path/to/feature/data"

In [3]:
# Read the cognition targets and the subject identifiers from the response
# variables file. The subject list drives the loops in the following cells.
# The context manager closes the HDF5 handle even if one of the datasets is
# missing or the read fails, so the file is never left open in the kernel.
with h5py.File(opj("../data", "data_response_variables.hdf5"), "r") as f:
    YY_domain_cognition = f['Y_cognition'][:]
    subjects = f['subjects'][:]

In [13]:
# One list per modality; each receives one feature vector per subject and is
# converted to an array once all subjects have been read.
connectome_features = []
surface_features = []
thickness_features = []
sub_vols_features = []

for sub_id in subjects:
    # Both per-subject folders share the same "sub-<id>" naming.
    sub_label = "sub-" + str(sub_id)
    functional_dir = opj(data_dir, "final", "functional", sub_label)
    structural_dir = opj(data_dir, "final", "structural", sub_label)

    # Connectome: keep only the entries strictly above the diagonal (k=1).
    output_connectome = opj(functional_dir, "connectome.txt")
    connectome = np.loadtxt(output_connectome)
    upper_rows, upper_cols = np.triu_indices_from(connectome, k=1)
    connectome_features.append(connectome[upper_rows, upper_cols])

    # Surface and thickness files are used as loaded.
    output_surface = opj(structural_dir, "surface.txt")
    surface_features.append(np.loadtxt(output_surface))

    output_thickness = opj(structural_dir, "thickness.txt")
    thickness_features.append(np.loadtxt(output_thickness))

    # Volumes file is tab-separated; the values come from its second column.
    output_sub_vols = opj(structural_dir, "sub_volumes.txt")
    sub_vols_df = pd.read_csv(output_sub_vols, sep="\t")
    sub_vols_features.append(sub_vols_df.iloc[:, 1].values)

# Stack the per-subject vectors into arrays (one row per subject).
connectome_features = np.asarray(connectome_features)
surface_features = np.asarray(surface_features)
thickness_features = np.asarray(thickness_features)
sub_vols_features = np.asarray(sub_vols_features)

In [15]:
# Here we load the local connectome data, previously downloaded from
# http://dsi-studio.labsolver.org/download-images/local-connectome-fingerprints-of-hcp-1062-subjects-for-neofac-prediction
# Both variables are requested in a single loadmat call so the .mat file is
# opened and parsed only once instead of once per variable.
loc_conn_mat = loadmat(opj(data_dir, "HCP1062_NEOFAC_fp.mat"),
                       squeeze_me=True, variable_names=["names", "subjects"])
loc_conn_names = loc_conn_mat['names']
loc_conn_dat = loc_conn_mat['subjects']

In [16]:
# Pick, for every subject (in the order of `subjects`), the local connectome
# entry whose name equals the subject identifier.
loc_conn_features = []

for subj in subjects:
    subj_rows = loc_conn_dat[loc_conn_names == subj]

    # Exactly one entry must match. Zero or several matches would silently
    # produce a vector of the wrong length, leaving the feature matrix ragged
    # or misaligned with the targets, so fail here with an explicit message.
    if subj_rows.shape[0] != 1:
        raise ValueError(
            "Expected exactly one local connectome entry for subject {!r}, "
            "found {}".format(subj, subj_rows.shape[0])
        )

    loc_conn_features.append(subj_rows.flatten())

loc_conn_features = np.array(loc_conn_features)

In [17]:
# Sanity check: print the shape of every matrix assembled above, pairing each
# message with the array it describes.
shape_report = (
    ("The shape of connectome features matrix is ", connectome_features),
    ("The shape of surface features matrix is ", surface_features),
    ("The shape of thickness features matrix is ", thickness_features),
    ("The shape of subcortial volumes features matrix is ", sub_vols_features),
    ("The shape of local connectome features matrix is ", loc_conn_features),
    ("The shape of target matrix for predefined domain scores is ", YY_domain_cognition),
)

for report_label, report_array in shape_report:
    print(report_label, report_array.shape)

The shape of connectome features matrix is  (1029, 257403)
The shape of surface features matrix is  (1029, 360)
The shape of thickness features matrix is  (1029, 360)
The shape of subcortial volumes features matrix is  (1029, 66)
The shape of local connectome features matrix is  (1029, 128894)
The shape of target matrix for predefined domain scores is  (1029, 7)


In [32]:
# Save these to disk.
# All feature matrices, the targets and the subject identifiers go into a
# single HDF5 file. The context manager guarantees the file is flushed and
# closed even if one of the dataset writes raises, so no open handle is left
# behind in the kernel.
with h5py.File(opj(data_dir, "final_data.hdf5"), "w") as f:
    f.create_dataset("connectome_features", data=connectome_features)
    f.create_dataset("surface_features", data=surface_features)
    f.create_dataset("thickness_features", data=thickness_features)
    f.create_dataset("sub_vols_features", data=sub_vols_features)
    f.create_dataset("loc_conn_features", data=loc_conn_features)
    f.create_dataset("YY_domain_cognition", data=YY_domain_cognition)
    f.create_dataset("subjects", data=subjects)