**This notebook can be used to filter skeleton files from the entire set of skeleton files (in hdf5 format) such that the selected files are fit for eigen ciona calculation.**
- For eigen cionas, we pick skeleton files which have more than 5000 (see below) frames in them.
- Also, files which have no NaN rows are chosen. 
- The skeleton (coordinates) data from the selected hierarchical data files are saved as numpy array.

In [1]:
import os
import sys
import numpy as np

from joblib import Parallel, delayed


In [2]:
sys.path.append('../code/')
from data_handling import read_skeleton_as_array, calc_quality, del_nan_rows

In [3]:
skel_data_folder = "/share/data/longterm/2/Jerneja_Behaviour/Tierpsy_skeleton_files_jerneja/"
dest_file_folder = "/share/data/temp/athira/tierpsy_skeleton_files_for_eigen_npy/"

In [4]:
#dest_file_folder = "/share/data/temp/athira/tierpsy_skeleton_files_lightON_npy/"

# Getting the skeleton files

In [5]:
# Load the original hdf5 skeleton files 

skel_files = [os.path.join(root,name) for root,dirs,files in os.walk(skel_data_folder) 
              for name in files if name.endswith('skeletons.hdf5')]

print(f"Total number of hdf5 skeleton files:{len(skel_files)}")

Total number of hdf5 skeleton files:4286



# Pick skeletons for computing eigen ciona

In [6]:
def select_skeletons(file_path, feature_files):
    
    file_name = os.path.basename(file_path)
    file_name = file_name.split("_")[:-1]
    light_bool = file_name[6]
    date_time = f"{file_name[0]}_{file_name[1]}"
    fn_prefix = '_'.join(file_name)
    neck_point = None
    
    # TODO: find the neckpoint from feature files 

    for feat_file in feature_files:
        if date_time in feat_file:
            neck_point = feat_file.split('_')[0]
    
#     if neck_point in ['10n','11n','12n','13n','14n','15n']:
    if (neck_point in ['10n','11n','12n','13n','14n','15n']) : #& (light_bool != "None")
        
        dest_file_name = f"{neck_point}_{fn_prefix}_skeleton"
        dest_file_path = os.path.join(dest_file_folder, dest_file_name)

        skel_array = read_skeleton_as_array(file_path)
        skel_quality = calc_quality(skel_array)

        # choose skeletons with more than 5000 frames and no nan rows at all 
        if (len(skel_array) > 500) & (skel_quality ==100):
        
            skel_array = del_nan_rows(skel_array)
            np.save(dest_file_path, skel_array) ## save as npy files

            return 1
    else:
        return 0
        

In [7]:
# feature files are used as a temporary fix to get neckpoints for the skeletons
feature_files_folder = "/share/data/temp/daniel/20200211_tierpsy_features/"
(_,_,feature_files_JDA)  = next(os.walk(feature_files_folder))

# Parallel save the selected skeletons as pickles 
skel_pickles = Parallel(n_jobs=10, verbose = 5)(delayed(select_skeletons)(skel_file, feature_files_JDA) 
                                                for skel_file in skel_files)
print(f"Number of skeletons saved: {np.sum(skel_pickles==1)}")

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  52 tasks      | elapsed:    2.8s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:    5.7s
[Parallel(n_jobs=10)]: Done 378 tasks      | elapsed:   18.3s
[Parallel(n_jobs=10)]: Done 540 tasks      | elapsed:   27.4s
[Parallel(n_jobs=10)]: Done 738 tasks      | elapsed:   37.4s
[Parallel(n_jobs=10)]: Done 972 tasks      | elapsed:   48.0s
[Parallel(n_jobs=10)]: Done 1242 tasks      | elapsed:  1.0min
[Parallel(n_jobs=10)]: Done 1548 tasks      | elapsed:  1.2min
[Parallel(n_jobs=10)]: Done 2030 tasks      | elapsed:  1.5min
[Parallel(n_jobs=10)]: Done 2569 tasks      | elapsed:  1.8min
[Parallel(n_jobs=10)]: Done 2992 tasks      | elapsed:  2.1min
[Parallel(n_jobs=10)]: Done 3442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=10)]: Done 4086 tasks      | elapsed:  2.8min
[Parallel(n_jobs=10)]: Done 4267 out of 4286 | elapsed:  3.0min remaining:    0.8s


Number of skeletons saved: 0


[Parallel(n_jobs=10)]: Done 4286 out of 4286 | elapsed:  3.0min finished


In [8]:
(root, dirs, filenames) = next(os.walk(dest_file_folder))
skels_selected = [fname for fname in filenames if fname.endswith('.npy')]
    
len(skels_selected)

252

In [9]:
print([len(np.load(os.path.join(dest_file_folder,skel_arr))) for skel_arr in skels_selected])

[9000, 9000, 9003, 17256, 9003, 9000, 9002, 9001, 9000, 6827, 8999, 1026, 9002, 9002, 2018, 9001, 8839, 9001, 9002, 8981, 8059, 9001, 9002, 9002, 8135, 8871, 9002, 4543, 8981, 9003, 9001, 7964, 6383, 4097, 9003, 8568, 9002, 9002, 9001, 8778, 9001, 9001, 8567, 9001, 26999, 9001, 9000, 9001, 8912, 27001, 9000, 9003, 8371, 9000, 3317, 26998, 9000, 8951, 8739, 9003, 8954, 8817, 9001, 9002, 8671, 2369, 9001, 9003, 22409, 9002, 8992, 8045, 3566, 9003, 9002, 9000, 9002, 8820, 9000, 9002, 9002, 9001, 7635, 9001, 8793, 9003, 9002, 9002, 9002, 8875, 9002, 9002, 9002, 9001, 27001, 9002, 9001, 9002, 9000, 8024, 9001, 9002, 9002, 6435, 9001, 7925, 27001, 13092, 27001, 7425, 8677, 8998, 9002, 26999, 21751, 8734, 9000, 8998, 8984, 8999, 9001, 9001, 4987, 8995, 9001, 2175, 2662, 9001, 9002, 9004, 8948, 9003, 3814, 9003, 7600, 9001, 6506, 9003, 8578, 8968, 9002, 9002, 8678, 9001, 9000, 9001, 7562, 7249, 26997, 9001, 5000, 9001, 9002, 7562, 9002, 8998, 9001, 9001, 8927, 8568, 9001, 9001, 8134, 9000, 801