# Analysis of the Results File: L2_skeletons Obtained via Tierpsy Tracker

Drive connection

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Libraries

In [2]:
import h5py
import os
import pandas as pd
import numpy as np
import json

Inspecting file content

In [3]:
def inspect_hdf5_datasets(file_path):
    """
    Inspects an HDF5 file and prints the names and sizes of its datasets.

    Args:
        file_path (str): The path to the HDF5 file.
    """
    print(f"\n{'='*30}")
    print(f"Inspecting file: {file_path}")
    print(f"{'='*30}")
    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return  # Exit the function if the file doesn't exist

        with h5py.File(file_path, 'r') as hdfid:
            datasets_info = {}
            for name, obj in hdfid.items():
                if isinstance(obj, h5py.Dataset):
                    datasets_info[name] = obj.size

            if datasets_info:
                print("Datasets found and their sizes:")
                for name, size in datasets_info.items():
                    print(f"  Dataset: {name}, Size: {size} elements")
            else:
                print("No top-level datasets were found in this file.")

    except Exception as e:
        print(f"An error occurred while processing the file '{file_path}': {e}")

# Define the file path you want to analyze here.  REPLACE THIS!
file_to_analyze = '/content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/n2_skeletons.hdf5'  # <--- REPLACE THIS LINE

inspect_hdf5_datasets(file_to_analyze)


Inspecting file: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/n2_skeletons.hdf5
Datasets found and their sizes:
  Dataset: blob_features, Size: 23324 elements
  Dataset: contour_area, Size: 23324 elements
  Dataset: contour_side1, Size: 2285752 elements
  Dataset: contour_side1_length, Size: 23324 elements
  Dataset: contour_side2, Size: 2285752 elements
  Dataset: contour_side2_length, Size: 23324 elements
  Dataset: contour_width, Size: 1142876 elements
  Dataset: plate_worms, Size: 24866 elements
  Dataset: skeleton, Size: 2285752 elements
  Dataset: skeleton_length, Size: 23324 elements
  Dataset: trajectories_data, Size: 23324 elements
  Dataset: width_midbody, Size: 23324 elements


Exporting the datasets to individuals CSV files

In [4]:
 # IMPORTANT NOTE:
# Some reshaping was needed to convert the datasets into a 2D array for its correct extraction
# Explanation:
# Pandas DataFrames, which are used to create CSV files, require 2-dimensional data.
# HDF5 datasets, however, can have any number of dimensions (1D, 2D, 3D, etc.).
# When a dataset has more than 2 dimensions, we need to reshape it into a 2D
# structure so that it can be stored in a DataFrame and then written to a CSV file.

In [8]:
def export_hdf5_datasets_to_csv(file_path, output_dir):
    """
    Exports all datasets from an HDF5 file (including those within groups)
    to individual CSV files. Handles datasets with more than 2 dimensions
    by reshaping them. Exports scalar datasets from 'provenance_tracking'
    to a separate CSV file.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The path to the directory where the CSV files will be saved.
    """
    print(f"\n{'='*30}")
    print(f"Exporting all datasets from: {file_path}")
    print(f"CSV files will be saved in: {output_dir}")
    print(f"{'='*30}")

    scalar_data = []

    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        def process_hdf5_item(name, obj):
            nonlocal scalar_data
            if isinstance(obj, h5py.Dataset):
                print(f"\nProcessing dataset: {name}")
                if len(obj.shape) > 0:  # Check if the dataset has dimensions
                    data = obj[:]  # Read all data

                    if len(data.shape) > 2:
                        # Reshape the data to 2D
                        original_shape = data.shape
                        new_shape = (original_shape[0], np.prod(original_shape[1:]))
                        data = data.reshape(new_shape)
                        print(f"  Reshaped dataset '{name}' from {original_shape} to {new_shape}")

                    # Convert the data to a Pandas DataFrame
                    df = pd.DataFrame(data)

                    # Construct the output CSV file path
                    csv_file_path = os.path.join(output_dir, f"{name.replace('/', '_')}.csv")
                    # Replace '/' with '_' in the filename to avoid directory issues

                    # Save the DataFrame to a CSV file
                    df.to_csv(csv_file_path, index=False)
                    print(f"  Dataset '{name}' successfully exported to: {csv_file_path}")
                elif obj.parent.name == '/provenance_tracking':
                    scalar_data.append({'name': name, 'value': obj[()]})
                    print(f"  Found scalar dataset in 'provenance_tracking': {name} = {obj[()]}")
                else:
                    print(f"  Skipping scalar dataset: {name} with shape {obj.shape}")
            elif isinstance(obj, h5py.Group):
                print(f"\nExploring group: {name}")
                # Recursively process items within the group
                obj.visititems(process_hdf5_item)
            else:
                print(f"  Skipping non-dataset/group: {name}")

        with h5py.File(file_path, 'r') as hdf_file:
            hdf_file.visititems(process_hdf5_item)

        # Save scalar data from 'provenance_tracking' to a CSV
        if scalar_data:
            scalar_df = pd.DataFrame(scalar_data)
            scalar_csv_path = os.path.join(output_dir, "provenance_tracking_scalars.csv")
            scalar_df.to_csv(scalar_csv_path, index=False)
            print(f"\nScalar datasets from 'provenance_tracking' exported to: {scalar_csv_path}")

    except Exception as e:
        print(f"An error occurred: {e}")

    print("\nData export process complete.")

if __name__ == "__main__":
    # Example usage:  MODIFY THESE PATHS APPROPRIATELY
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/n2_skeletons.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets'  # <--- Replace with the desired output folder

    export_hdf5_datasets_to_csv(hdf5_file_path, csv_output_directory)


Exporting all datasets from: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/n2_skeletons.hdf5
CSV files will be saved in: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets

Processing dataset: blob_features
  Dataset 'blob_features' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets/blob_features.csv

Processing dataset: contour_area
  Dataset 'contour_area' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets/contour_area.csv

Processing dataset: contour_side1
  Reshaped dataset 'contour_side1' from (23324, 49, 2) to (23324, np.int64(98))
  Dataset 'contour_side1' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets/contour_side1.csv

Processing dataset: contour_side1_length
  Dataset 'contour_side1_length' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets/contour_side1_length.csv

Processing dataset: cont

Visualization of each dataset

1. Top-Level Datasets

In [9]:
csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets'

print(f"\n{'='*30}")
print(f"Visualizing the first 5 rows of the datasets in: {csv_output_directory}")
print(f"{'='*30}")
try:
    # Check if the output directory exists
    if not os.path.exists(csv_output_directory):
        print(f"Error: The directory '{csv_output_directory}' was not found.")
        exit()

    # Iterate through all files in the specified directory
    for filename in os.listdir(csv_output_directory):
        if filename.endswith(".csv"):  # Check if the file is a CSV file
            file_path = os.path.join(csv_output_directory, filename)
            try:
                # Read the CSV file into a Pandas DataFrame
                df = pd.read_csv(file_path)

                print(f"\n--- File: {filename} ---")
                print("First 5 rows:")
                if not df.empty:
                    print(df.head())
                else:
                    print("The DataFrame is empty.")

            except pd.errors.EmptyDataError:
                print(f"Warning: The file '{filename}' is empty.")
            except Exception as e:
                print(f"Error reading the file '{filename}': {e}")
except Exception as e:
    print(f"An error occurred: {e}")

print("\nDataset visualization process completed.")


Visualizing the first 5 rows of the datasets in: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Datasets

--- File: time.csv ---
First 5 rows:
    0
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN

--- File: trajectories_data.csv ---
First 5 rows:
   frame_number  worm_index_joined  plate_worm_id  skeleton_id    coord_x  \
0             0                  2             16            0  1648.0538   
1             1                  2             18            1  1647.4797   
2             2                  2             20            2  1647.0740   
3             3                  2             22            3  1646.7949   
4             4                  2             24            4  1646.6013   

     coord_y  threshold  has_skeleton  roi_size   area  timestamp_raw  \
0  1663.1330      128.1             1      68.0  366.5            NaN   
1  1662.4630      128.1             1      68.0  366.5            NaN   
2  1661.9379      128.1             1      68.0  366.5            NaN   
3  16

2. Datasets extracted from the groups

In [10]:
csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Groups'

print(f"\n{'='*30}")
print(f"Visualizing the first 5 rows of the datasets in: {csv_output_directory}")
print(f"{'='*30}")
try:
    # Check if the output directory exists
    if not os.path.exists(csv_output_directory):
        print(f"Error: The directory '{csv_output_directory}' was not found.")
        exit()

    # Iterate through all files in the specified directory
    for filename in os.listdir(csv_output_directory):
        if filename.endswith(".csv"):  # Check if the file is a CSV file
            file_path = os.path.join(csv_output_directory, filename)
            try:
                # Read the CSV file into a Pandas DataFrame
                df = pd.read_csv(file_path)

                print(f"\n--- File: {filename} ---")
                print("First 5 rows:")
                if not df.empty:
                    print(df.head())
                else:
                    print("The DataFrame is empty.")

            except pd.errors.EmptyDataError:
                print(f"Warning: The file '{filename}' is empty.")
            except Exception as e:
                print(f"Error reading the file '{filename}': {e}")
except Exception as e:
    print(f"An error occurred: {e}")

print("\nDataset visualization process completed.")


Visualizing the first 5 rows of the datasets in: /content/drive/MyDrive/Worms/Resultados/n2/n2_skeletons/Groups

--- File: timestamp_raw.csv ---
First 5 rows:
    0
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN

--- File: timestamp_time.csv ---
First 5 rows:
    0
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN

--- File: intensity_analysis_switched_head_tail.csv ---
First 5 rows:
   worm_index  ini_frame  last_frame
0           8        191         202
1           9          7          14
2         124        625         627
3         187        543         628
4         212        591         622

--- File: provenance_tracking_scalars.csv ---
First 5 rows:
             name                                              value
0      BLOB_FEATS  b'{"func_name": "getBlobsFeats", "func_argumen...
1  INT_SKE_ORIENT  b'{"func_name": "correctHeadTailIntensity", "f...
2      SKE_CREATE  b'{"func_name": "trajectories2Skeletons", "fun...
3        SKE_FILT  b'{"func_name": "getFilteredSkels", "func_argu...
4        SKE_INIT  b'