# Analysis of the Results File: L2_intensities Obtained via Tierpsy Tracker

Drive connection

In [1]:
# Mount Google Drive inside the Colab runtime. The paths used in the cells
# below all start with '/content/drive', i.e. the mount point passed here.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Libraries

In [2]:
import h5py
import os
import pandas as pd
import numpy as np
import json

Inspecting file content

In [3]:
def inspect_hdf5_datasets(file_path):
    """
    Print the name and element count of every top-level dataset in an HDF5 file.

    Only direct children of the file root are considered; groups (and anything
    nested inside them) are ignored. All problems are reported via ``print``
    rather than raised, so the function always returns ``None``.

    Args:
        file_path (str): The path to the HDF5 file.
    """
    rule = '=' * 30
    print(f"\n{rule}")
    print(f"Inspecting file: {file_path}")
    print(f"{rule}")
    try:
        # Guard clause: nothing to inspect when the path does not exist.
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return

        # Collect {dataset name: number of elements} for root-level datasets.
        with h5py.File(file_path, 'r') as h5_handle:
            sizes_by_name = {
                key: node.size
                for key, node in h5_handle.items()
                if isinstance(node, h5py.Dataset)
            }

        if not sizes_by_name:
            print("No top-level datasets were found in this file.")
            return

        print("Datasets found and their sizes:")
        for key, n_elements in sizes_by_name.items():
            print(f"  Dataset: {key}, Size: {n_elements} elements")

    except Exception as e:
        print(f"An error occurred while processing the file '{file_path}': {e}")

# Path of the HDF5 file to inspect. Edit this value to point at your own file.
file_to_analyze = '/content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5'  # <--- replace with your HDF5 file path

# Print the top-level datasets of the file and their element counts.
inspect_hdf5_datasets(file_to_analyze)


Inspecting file: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5
Datasets found and their sizes:
  Dataset: straighten_worm_intensity_median, Size: 1349693 elements
  Dataset: trajectories_data_valid, Size: 10303 elements


Exporting the datasets to individual CSV files

In [4]:
def _dataset_to_dataframe(data):
    """Turn the values read from one HDF5 dataset into a DataFrame pandas can export."""
    values = np.asarray(data)
    if values.ndim == 0:
        # Scalar dataset: represent it as a single-row table.
        values = values.reshape(1)
    elif values.ndim > 2:
        # pandas only accepts 1-D/2-D input; keep axis 0 as rows and flatten the rest.
        print(f"  Note: flattening data of shape {values.shape} to 2-D for CSV export.")
        values = values.reshape(values.shape[0], -1)
    return pd.DataFrame(values)


def export_hdf5_datasets_to_csv(file_path, output_dir):
    """
    Exports every top-level dataset of an HDF5 file to its own CSV file.

    Each dataset ``<name>`` found directly under the file root is written to
    ``<output_dir>/<name>.csv`` (without the DataFrame index). Items that are
    not datasets (e.g. groups) are reported and skipped.

    A failure while exporting one dataset is reported and does not prevent the
    remaining datasets from being exported. Problems are reported via ``print``
    and never raised; the final message states whether the export completed,
    finished with per-dataset errors, or was aborted.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The path to the directory where the CSV files will be
            saved. It is created if it does not exist.

    Returns:
        None
    """
    print(f"\n{'='*30}")
    print(f"Exporting all datasets from: {file_path}")
    print(f"CSV files will be saved in: {output_dir}")
    print(f"{'='*30}")

    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    failed_datasets = []
    try:
        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with h5py.File(file_path, 'r') as hdf_file:
            for dataset_name in hdf_file:
                # Isolate each item so one bad dataset cannot abort the others.
                try:
                    item = hdf_file[dataset_name]
                    if not isinstance(item, h5py.Dataset):
                        print(f"Skipping: '{dataset_name}' is not a dataset.")
                        continue

                    print(f"\nProcessing dataset: {dataset_name}")
                    # `[()]` reads the whole dataset and, unlike `[:]`,
                    # also works for scalar (0-dimensional) datasets.
                    df = _dataset_to_dataframe(item[()])

                    csv_file_path = os.path.join(output_dir, f"{dataset_name}.csv")
                    df.to_csv(csv_file_path, index=False)
                    print(f"Dataset '{dataset_name}' successfully exported to: {csv_file_path}")
                except Exception as e:
                    failed_datasets.append(dataset_name)
                    print(f"Error: could not export '{dataset_name}': {e}")

    except Exception as e:
        # File-level failure (cannot create the directory, cannot open the file, ...).
        print(f"An error occurred: {e}")
        print("\nData export process aborted.")
        return

    if failed_datasets:
        print(f"\nData export process finished with errors. Failed items: {failed_datasets}")
    else:
        print("\nData export process complete.")



# Driver for the export. The guard is true when this cell is executed in the
# notebook (or the code is run as a script), so the export runs immediately.
if __name__ == "__main__":
    # Example usage: adjust both paths to your own Drive layout.
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Datasets'  # <--- Replace with the desired output folder

    # Writes one CSV per top-level dataset into csv_output_directory.
    export_hdf5_datasets_to_csv(hdf5_file_path, csv_output_directory)


Exporting all datasets from: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5
CSV files will be saved in: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Datasets
Skipping: 'provenance_tracking' is not a dataset.

Processing dataset: straighten_worm_intensity_median
Dataset 'straighten_worm_intensity_median' successfully exported to: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Datasets/straighten_worm_intensity_median.csv

Processing dataset: trajectories_data_valid
Dataset 'trajectories_data_valid' successfully exported to: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Datasets/trajectories_data_valid.csv

Data export process complete.


The export above skipped 'provenance_tracking' because it is an HDF5 group, not a dataset. Its attributes were therefore extracted separately:

In [5]:
# Note on non-dataset exports:
#
# The script exports standard HDF5 datasets to CSV files.  However, the following non-dataset item was handled specially:
#
# -  'provenance_tracking': This HDF5 group contains metadata attributes ('CLASS', 'TITLE', 'VERSION').
#    These attributes were extracted and written to a single-row CSV file.

In [6]:
def _attr_value_to_text(value):
    """Decode byte-string attribute values so the CSV holds text, not a ``b'...'`` repr."""
    # numpy byte-string scalars subclass ``bytes``, so this covers them too.
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return value


def extract_provenance_tracking(file_path, output_dir):
    """
    Extracts the attributes from the 'provenance_tracking' group in an HDF5 file
    and saves them to a CSV file.

    The attributes are written as a single row (one column per attribute) to
    ``<output_dir>/provenance_tracking.csv``. Byte-string attribute values are
    decoded as UTF-8 before writing. Problems are reported via ``print`` and
    never raised; the "complete" message is printed only when the CSV file was
    actually written.

    NOTE(review): only the group's attributes are read. Anything stored inside
    the group (child datasets or sub-groups) is not exported - confirm that
    this is the intended content.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The directory where the CSV file will be saved.
            It is created if it does not exist.

    Returns:
        None
    """
    print(f"\n{'='*30}")
    print(f"Extracting provenance_tracking from: {file_path}")
    print(f"CSV file will be saved in: {output_dir}")
    print(f"{'='*30}")

    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with h5py.File(file_path, 'r') as hdf_file:
            if 'provenance_tracking' not in hdf_file:
                print(f"Warning: 'provenance_tracking' group not found in HDF5 file.")
                return

            provenance_group = hdf_file['provenance_tracking']
            # Copy the attributes out while the file is still open.
            attributes = {
                attr_name: _attr_value_to_text(attr_value)
                for attr_name, attr_value in provenance_group.attrs.items()
            }

        df = pd.DataFrame([attributes])  # Create a DataFrame with a single row
        csv_file_path = os.path.join(output_dir, "provenance_tracking.csv")
        df.to_csv(csv_file_path, index=False)
        print(f"Provenance tracking data successfully exported to: {csv_file_path}")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("\nProvenance tracking extraction aborted.")
        return

    print("\nProvenance tracking extraction process complete.")



# Driver for the provenance extraction. The guard is true when this cell is
# executed in the notebook (or the code is run as a script).
if __name__ == "__main__":
    # Example usage: adjust both paths to your own Drive layout.
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Groups'  # <--- Replace with the desired output folder

    # Writes provenance_tracking.csv into csv_output_directory.
    extract_provenance_tracking(hdf5_file_path, csv_output_directory)


Extracting provenance_tracking from: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/L2_intensities.hdf5
CSV file will be saved in: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Groups
Provenance tracking data successfully exported to: /content/drive/MyDrive/Worms/Resultados/L2/L2_intensities/Groups/provenance_tracking.csv

Provenance tracking extraction process complete.
