# Load libraries

In [1]:
import os
import h5py
import pickle
import numpy as np


In [2]:
# Subject IDs to process: sub-3 through sub-40, with sub-9 and sub-27 not in the list
subjNums = [f'sub-{num}' for num in range(3, 41) if num not in (9, 27)]

In [3]:
# Input: directory with one folder per subject holding run-level pickle files,
# and the suffix that every pickle filename ends with
actual_suffix = '_actualBetas.pkl'
actual_data_dir = '/projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/'

# Output: directory that receives one .h5 file per subject
output_dir = '/projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwiseH5/'

# Task runs to convert; each label is also the run part of the pickle filename
allRuns = [
    'test1_901',
    'test2_1101',
    'test3_1301',
    'test4_1501',
    'test5_1701',
    'test6_1901',
    'test7_2101',
    'test8_2301',
]

In [4]:
# helper for reading the raw pickle files
def load_and_print_pickle(file_path):
    """Unpickle the file at ``file_path`` and return the resulting object.

    Despite the function's name, nothing is printed; the object is only returned.

    NOTE: unpickling can execute arbitrary code, so only point this at
    trusted files.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# load sample pickle file

In [6]:
# Load one run's pickle as a sanity check. The path is assembled from the
# configured input directory and filename suffix (the same scheme preprocess()
# uses) instead of repeating the absolute path, so it cannot drift from the config.
sample_subj, sample_run = 'sub-10', 'test5_1701'
sample_path = os.path.join(actual_data_dir, sample_subj, sample_subj + '_' + sample_run + actual_suffix)
data = load_and_print_pickle(sample_path)
# First entry: a dict holding the 'miniblock' number and its list of 'trials'
data[0]

{'miniblock': 1,
 'trials': [{'condition': 'Novel_Instruction',
   'betas': array([ 1.07548233e-01, -1.81739225e-02, -2.90498607e-02,  3.25399030e-02,
           1.68178885e-02,  1.11743333e-01,  1.72757497e-01, -5.19398706e-02,
          -6.56979676e-02,  5.90985557e-02,  3.32330225e-02, -4.19406177e-02,
           6.91775455e-02, -6.34257110e-02,  8.41482740e-02,  9.94059181e-02,
           1.02750245e-01,  9.89624814e-02,  1.15529472e-01,  1.74665450e-01,
           1.03973446e-01,  1.80833927e-01,  8.92829597e-02, -5.05446912e-02,
          -5.48150017e-02,  2.30884243e-03, -2.26701690e-02,  6.66618887e-02,
           1.39794195e-02, -8.83532487e-03,  4.91605369e-02, -3.21544535e-02,
           3.26922518e-02, -7.47197479e-02, -6.92747336e-02, -5.97998693e-02,
          -4.52350049e-02, -3.85209961e-02, -2.70557345e-02,  3.05329036e-03,
          -4.48683696e-02,  3.45936400e-02, -5.02066308e-02, -2.74357852e-02,
          -3.63847635e-02,  3.47517206e-02, -5.34426102e-02,  2.82966

# convert pickle to h5 

In [17]:
def _write_miniblock(run_group, miniblock):
    """Write one miniblock's trials into a new ``miniblock_<n>`` subgroup of ``run_group``."""
    mb_num = miniblock['miniblock']
    mb_group = run_group.create_group(f'miniblock_{mb_num}')

    trials = miniblock['trials']
    if not trials:
        # np.stack raises ValueError on an empty list; report it and leave the
        # group empty rather than aborting a half-written file.
        print(f'No trials in {mb_group.name}; no datasets written')
        return

    conditions = [trial['condition'] for trial in trials]
    # one row per trial: the per-trial beta arrays are stacked along a new first axis
    betas = np.stack([trial['betas'] for trial in trials])

    # dtype='S' stores the labels as fixed-length byte strings
    mb_group.create_dataset('condition', data=np.array(conditions, dtype='S'))
    mb_group.create_dataset('betas', data=betas)

    # Handle motor_response if it exists in any trial (missing entries become b'')
    if any('motor_response' in trial for trial in trials):
        motor_responses = [trial.get('motor_response', '') for trial in trials]
        mb_group.create_dataset('motor_response', data=np.array(motor_responses, dtype='S'))

    # Handle Stimuli if it exists in any trial (missing entries become NaN)
    if any('Stimuli' in trial for trial in trials):
        stimuli_vals = [trial.get('Stimuli', np.nan) for trial in trials]
        mb_group.create_dataset('Stimuli', data=np.array(stimuli_vals))


def preprocess(subj):
    """Convert one subject's run-wise pickle files into a single .h5 file.

    For every run in ``allRuns`` the pickle
    ``<actual_data_dir>/<subj>/<subj>_<run><actual_suffix>`` is loaded and
    written to ``<output_dir>/<subj>_actualBetas.h5`` with this layout::

        <run>/miniblock_<n>/condition       byte strings, one per trial
        <run>/miniblock_<n>/betas           per-trial betas stacked along a new first axis
        <run>/miniblock_<n>/motor_response  only if some trial has it; b'' where missing
        <run>/miniblock_<n>/Stimuli         only if some trial has it; NaN where missing

    Parameters
    ----------
    subj : str
        Subject ID as used in the folder and file names (e.g. 'sub-10').

    Notes
    -----
    * A run whose pickle file is missing is reported with a printed message
      and skipped; no group is created for it.
    * The output file is opened in 'w' mode, so an existing file for the
      subject is overwritten.
    * ``output_dir`` is created if it does not exist yet.
    """
    # h5py does not create missing parent directories, so make sure the target exists
    os.makedirs(output_dir, exist_ok=True)

    # specify subject's output path
    output_path = os.path.join(output_dir, subj + '_actualBetas.h5')

    with h5py.File(output_path, 'w') as h5f:
        for run in allRuns:
            run_path = os.path.join(actual_data_dir, subj, subj + '_' + run + actual_suffix)
            try:
                run_data = load_and_print_pickle(run_path)
            except FileNotFoundError:
                # best effort: some subjects lack individual runs
                print(f'File not found: {run_path}')
                continue

            run_group = h5f.create_group(run)
            for miniblock in run_data:
                _write_miniblock(run_group, miniblock)


In [18]:
# Convert every subject listed in subjNums. preprocess() prints a
# "File not found" line for each missing run pickle and carries on with
# the remaining runs.
for subj in subjNums:
    preprocess(subj)

File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-16/sub-16_test7_2101_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-21/sub-21_test4_1501_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-29/sub-29_test5_1701_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-31/sub-31_test7_2101_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-32/sub-32_test2_1101_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-35/sub-35_test3_1301_actualBetas.pkl
File not found: /projects/f_mc1689_1/MeiranNext/data/results/ArunResults/ActualBetasParcelwise/sub-38/sub-38_test5_1701_actualBetas.pkl


# load sample h5 file

In [7]:
# Open one converted file read-only for inspection. The path is built from the
# configured output directory and the '<subject>_actualBetas.h5' naming that
# preprocess() writes, instead of a separate hard-coded absolute path.
sample_file = os.path.join(output_dir, 'sub-10' + '_actualBetas.h5')
# The handle is deliberately left open because the following cells index into
# it; call h5f.close() once inspection is finished.
h5f = h5py.File(sample_file, 'r')
# Top-level groups: one per run that was converted
list(h5f.keys())

In [10]:
# Miniblock groups of one run. Names are listed in lexicographic order
# ('miniblock_10' comes before 'miniblock_2'), so sort numerically when order matters.
list(h5f['test1_901'].keys())

['miniblock_1',
 'miniblock_10',
 'miniblock_11',
 'miniblock_12',
 'miniblock_2',
 'miniblock_3',
 'miniblock_4',
 'miniblock_5',
 'miniblock_6',
 'miniblock_7',
 'miniblock_8',
 'miniblock_9']

In [11]:
# Datasets stored for a single miniblock (slash-separated path into the file)
list(h5f['test5_1701/miniblock_10'])

['Stimuli', 'betas', 'condition', 'motor_response']

In [12]:
# Per-trial condition labels of one miniblock, stored as byte strings
list(h5f['test5_1701/miniblock_10/condition'])

[b'Novel_Instruction',
 b'Novel_Correct_GO',
 b'Novel_Correct_GO',
 b'Novel_Correct_GO',
 b'Novel_Correct_GO']