Generate Connectome from Niftis
====

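This notebook generates a connectome set for use with `connectome_quick` from 4D surface NIfTI files (separate left- and right-hemisphere files for each run). For every subject it saves one `.npy` file with the masked time series and one `_norms.npy` file with the norm of each vertex's time course.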

In [None]:
import os 
import glob
import multiprocessing
import nibabel as nib
import numpy as np
from nilearn import image, maskers, surface
from tqdm import tqdm 
from natsort import natsorted
from nimlab import datasets
import matplotlib.pyplot as plt


# Don't be a pest when multiprocessing: lower this process's scheduling priority as far as possible.
# os.nice is only available on Unix-like systems, so skip it elsewhere instead of failing the cell.
if hasattr(os, "nice"):
    os.nice(19)

# Function that transforms a set of nifti files to an npy file and a norms file for use with connectome_quick
# args is a tuple in the form (lh_files[], rh_files[], subject_name, lh_mask_file, rh_mask_file, output_dir, gsr=False)
def transform(args):
    """Concatenate a subject's surface runs, mask them, and save time series plus norms.

    The arguments are packed in a single tuple so the function can be mapped
    directly over a list of subjects (serially or with ``multiprocessing.Pool``).

    Parameters
    ----------
    args : tuple
        ``(lh_files, rh_files, subject_name, lh_mask_file, rh_mask_file,
        output_dir[, global_signal_regression])``. The last element is optional
        and defaults to ``False``.

    Output
    ------
    Two files are written to ``output_dir`` (``np.save`` appends ``.npy``):

    * ``<subject_name>.npy`` - float16 array of shape
      ``(timepoints, masked_lh_vertices + masked_rh_vertices)``.
    * ``<subject_name>_norms.npy`` - float16 L2 norm of every column of that array.

    Raises
    ------
    ValueError
        If either hemisphere has no files, or if the concatenated hemispheres
        have a different number of time points.
    """
    lh_files, rh_files, subject_name, lh_mask_file, rh_mask_file, output_dir = args[:6]
    # The header comment advertises gsr=False, so treat the trailing flag as optional
    global_signal_regression = args[6] if len(args) > 6 else False

    if not lh_files or not rh_files:
        raise ValueError("No lh/rh files given for subject " + str(subject_name))

    # Runs are stacked along the 4th (time) axis.
    # NOTE(review): only the first two runs of each hemisphere are used - confirm this is intended.
    lh_subject_img = np.concatenate([image.get_data(f) for f in lh_files[:2]], axis=3)
    rh_subject_img = np.concatenate([image.get_data(f) for f in rh_files[:2]], axis=3)

    # Boolean per-vertex masks read from the mask files
    lh_mask = nib.load(lh_mask_file).agg_data().astype(bool)
    rh_mask = nib.load(rh_mask_file).agg_data().astype(bool)

    if lh_subject_img.shape[-1] == rh_subject_img.shape[-1]:
        timepoints = lh_subject_img.shape[-1]
    else:
        raise ValueError("Differing number of time points")

    # Number of vertices per hemisphere expected in the fs5 surface files
    n_vertices = 10242

    # Collapse the spatial axes into one vertex axis -> (vertices, timepoints)
    lh_subject_img = lh_subject_img.reshape((n_vertices, timepoints), order="F")
    rh_subject_img = rh_subject_img.reshape((n_vertices, timepoints), order="F")

    # Keep only masked vertices, stack lh above rh, then flip to (timepoints, vertices)
    masked = np.transpose(np.concatenate((lh_subject_img[lh_mask, :], rh_subject_img[rh_mask, :])))

    # "global" signal regression - remove average global signal from each voxel timecourse
    if global_signal_regression:
        # masked is (timepoints, vertices): the global signal is the mean across
        # vertices at every time point, i.e. one value per row. Averaging over
        # axis 0 would instead give each vertex's temporal mean and merely demean
        # the time courses.
        global_signal = np.mean(masked, axis=1, keepdims=True)
        masked = masked - global_signal

    # One norm per vertex time course (column)
    norms = np.linalg.norm(masked, axis=0)
    np.save(os.path.join(output_dir, subject_name), masked.astype('float16'))
    np.save(os.path.join(output_dir, subject_name + '_norms'), norms.astype('float16'))

## 1. Gather the files you want to convert from nifti to numpy arrays and specify where you want the output to go

In [None]:
# Get the files to be transformed, you may need wildcards, two examples provided:
# files = natsorted(glob.glob("/lab-share/Neuro-Cohen-e2/Public/projects/GSP/CBIG_fMRI_preprocess_Legacy_GSP_500M/sub*/vol/sub*gz"))
# files = natsorted(glob.glob("/data/nimlab/Yeo_1000_nii/vol/*"))

input_dir = '/data/nimlab/connectomes/fMRI/GSP1000_MF'
output_dir = '/data/nimlab/connectome_npy/GSP1000_MF_surf_fs5_GSR'

# This notebook works on surface data, so it uses the per-hemisphere fs5 masks from nimlab.
# (For reference, volume-based NIMLAB analyses have used the FSL 2mm_brain_mask_dil file since shifting to
# python code; older analyses used the 222.nii.gz mask as in Lead-DBS.)
lh_mask_img = datasets.get_img_path("fs5_mask_lh")
rh_mask_img = datasets.get_img_path("fs5_mask_rh")
# lh_mask_img = os.path.abspath("OLD_fs5_mask_lh.gii")
# rh_mask_img = os.path.abspath("OLD_fs5_mask_rh.gii")

# One file per run and hemisphere, naturally sorted so runs stay in numeric order
lh_files = natsorted(glob.glob(os.path.join(input_dir,"sub-*/surf/lh.*fs5.nii.gz")))
rh_files = natsorted(glob.glob(os.path.join(input_dir,"sub-*/surf/rh.*fs5.nii.gz")))

# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(output_dir, exist_ok=True)

if len(lh_files) != len(rh_files):
    raise ValueError("Some subjects are missing hemispheres")

# An empty match would otherwise pass silently and produce zero subjects downstream
if not lh_files:
    raise ValueError("No surface files found under " + input_dir)

# Sanity check: show the first few matches for each hemisphere
print(lh_files[:5])
print(rh_files[:5])

## 2. Generate a list of unique subject names (which we will then use to combine runs within subjects)

In [None]:
# Get unique subject names from dataset. This cell requires the user to change it based on the naming scheme of the files
# For instance, for files named: sub-0003_bld002_rest_skip4_stc_mc_bp_0.0001_0.08_resid_FS1mm_MNI1mm_MNI2mm_sm7_finalmask.nii.gz
# sub = f.split('/')[-1].split('_bld')[0]   will split the files using '_bld' and take what's before it to be the subject name
# Here the hemisphere prefix ("lh." / "rh.") is stripped as well.

subjects = []
lh_subjects = set()
rh_subjects = set()
for lh, rh in zip(lh_files, rh_files):
    lh_sub = os.path.basename(lh).split('_bld')[0].split("lh.")[-1]
    rh_sub = os.path.basename(rh).split('_bld')[0].split("rh.")[-1]
    subjects.extend((lh_sub, rh_sub))
    lh_subjects.add(lh_sub)
    rh_subjects.add(rh_sub)

# Equal file counts do not guarantee that both hemispheres cover the same subjects
if lh_subjects != rh_subjects:
    raise ValueError("Subjects differ between hemispheres: "
                     + ", ".join(sorted(lh_subjects ^ rh_subjects)))

# Sort the unique names so the processing order is the same on every run
# (the iteration order of a set of strings changes between kernel restarts)
unique_subjects = natsorted(set(subjects))
print(len(unique_subjects))

## 3. Make a list of runs for each subject and set up the arguments for conversion

In [None]:
# The transform function expects lists of runs that belong to the same subject so that they can be concatenated. 
# Since every dataset may have different naming conventions, you will have to test this yourself. 's' is the subject name.

# NOTE(review): output_dir ends in "_GSR" while this flag is False - presumably the input files were
# already global-signal regressed upstream; confirm before running.
global_signal_regression = False

subject_args = []
for s in unique_subjects:
    # Escape the name so characters such as '[' or '?' in it are matched literally by glob
    s_pattern = glob.escape(s)
    # NOTE(review): the trailing '*' matches by prefix, so if one subject name is a prefix of another
    # (e.g. sub-1 and sub-10) the longer subject's runs would be picked up too - verify your naming.
    lh_runs = natsorted(glob.glob(os.path.join(input_dir,s_pattern+'*','surf','lh.'+s_pattern+'*fs5.nii.gz')))
    rh_runs = natsorted(glob.glob(os.path.join(input_dir,s_pattern+'*','surf','rh.'+s_pattern+'*fs5.nii.gz')))
    # Fail here with a clear message rather than deep inside transform() after other subjects have run
    if not lh_runs or not rh_runs:
        raise ValueError("No lh/rh runs matched for subject " + s)
    subject_args.append((lh_runs, rh_runs, s, lh_mask_img, rh_mask_img, output_dir, global_signal_regression))

# show the results for the first five subjects
print(subject_args[:5])

# make sure we got everyone
number_of_subjects = len(subject_args)
print(number_of_subjects)

## 4. Run the conversion in serial or parallel as able for your computer (you want processes <= the number of cores you have access to)

In [None]:
# Convert it!

## Serial: one subject at a time (does not depend on multiprocessing.Pool)

for subject_arg in tqdm(subject_args):
    transform(subject_arg)


## Parallel: run all subjects at once (as long as your python environment is set up correctly)
## Uncomment the lines below and set `processes` to no more than the number of cores you can use.

# pool = multiprocessing.Pool(processes=80)
# list(tqdm(pool.imap(transform, subject_args), total=number_of_subjects))
# pool.close()
# pool.join()