# Analysis of the Results File: n2_featuresN Obtained via Tierpsy Tracker

Drive connection

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Libraries

In [2]:
import h5py
import os
import pandas as pd
import numpy as np
import json

Inspecting file content

In [3]:
def inspect_hdf5_datasets(file_path):
    """
    Inspects an HDF5 file and prints the names and sizes of its datasets.

    Args:
        file_path (str): The path to the HDF5 file.
    """
    print(f"\n{'='*30}")
    print(f"Inspecting file: {file_path}")
    print(f"{'='*30}")
    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return  # Exit the function if the file doesn't exist

        with h5py.File(file_path, 'r') as hdfid:
            datasets_info = {}
            for name, obj in hdfid.items():
                if isinstance(obj, h5py.Dataset):
                    datasets_info[name] = obj.size

            if datasets_info:
                print("Datasets found and their sizes:")
                for name, size in datasets_info.items():
                    print(f"  Dataset: {name}, Size: {size} elements")
            else:
                print("No top-level datasets were found in this file.")

    except Exception as e:
        print(f"An error occurred while processing the file '{file_path}': {e}")

# Define the file path you want to analyze here.  REPLACE THIS!
file_to_analyze = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5'  # <--- REPLACE THIS LINE

inspect_hdf5_datasets(file_to_analyze)


Inspecting file: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5
Datasets found and their sizes:
  Dataset: blob_features, Size: 23324 elements
  Dataset: features_stats, Size: 4539 elements
  Dataset: timeseries_data, Size: 23324 elements
  Dataset: trajectories_data, Size: 23324 elements


Exporting the datasets to individuals CSV files

In [4]:
def export_hdf5_datasets_to_csv(file_path, output_dir):
    """
    Exports all datasets from an HDF5 file to individual CSV files.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The path to the directory where the CSV files will be saved.
    """
    print(f"\n{'='*30}")
    print(f"Exporting all datasets from: {file_path}")
    print(f"CSV files will be saved in: {output_dir}")
    print(f"{'='*30}")

    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with h5py.File(file_path, 'r') as hdf_file:
            for dataset_name in hdf_file:
                if isinstance(hdf_file[dataset_name], h5py.Dataset):
                    print(f"\nProcessing dataset: {dataset_name}")
                    dataset = hdf_file[dataset_name]
                    data = dataset[:]  # Read all data

                    # Convert to Pandas DataFrame
                    df = pd.DataFrame(data)

                    # Construct the CSV file path
                    csv_file_path = os.path.join(output_dir, f"{dataset_name}.csv")

                    # Export to CSV
                    df.to_csv(csv_file_path, index=False)
                    print(f"Dataset '{dataset_name}' successfully exported to: {csv_file_path}")
                else:
                    print(f"Skipping: '{dataset_name}' is not a dataset.")

    except Exception as e:
        print(f"An error occurred: {e}")

    print("\nData export process complete.")



if __name__ == "__main__":
    # Example usage:  MODIFY THESE PATHS APPROPRIATELY
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets'  # <--- Replace with the desired output folder

    export_hdf5_datasets_to_csv(hdf5_file_path, csv_output_directory)


Exporting all datasets from: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5
CSV files will be saved in: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets

Processing dataset: blob_features
Dataset 'blob_features' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets/blob_features.csv
Skipping: 'coordinates' is not a dataset.

Processing dataset: features_stats
Dataset 'features_stats' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets/features_stats.csv
Skipping: 'provenance_tracking' is not a dataset.

Processing dataset: timeseries_data
Dataset 'timeseries_data' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets/timeseries_data.csv

Processing dataset: trajectories_data
Dataset 'trajectories_data' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets/trajectories_data.csv

Data export proce

A problem arose because 'coordinates' and 'provenance_tracking' were not datasets. They were extracted separately:

In [5]:
# Note on non-dataset exports:
#
# The script exports standard HDF5 datasets to CSV files.  However, the following non-dataset items were handled specially:
#
# -  'coordinates': This HDF5 group contains 3D datasets ('dorsal_contours', 'skeletons', 'ventral_contours').
#    These datasets were extracted and stored as JSON strings within individual CSV files to preserve their 3D structure.
#
# -  'provenance_tracking': This HDF5 group contains metadata attributes ('CLASS', 'TITLE', 'VERSION').
#    These attributes were extracted and written to a single-row CSV file.

1. Coordinates extraction

In [6]:
def extract_coordinates(file_path, output_dir):
    """
    Extracts the 'dorsal_contours', 'skeletons', and 'ventral_contours' datasets from an HDF5 file,
    handling their 3D structure by storing them as JSON strings in a CSV file.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The path to the directory where the CSV files will be saved.
    """
    print(f"\n{'='*30}")
    print(f"Extracting coordinates from: {file_path}")
    print(f"CSV files will be saved in: {output_dir}")
    print(f"{'='*30}")

    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with h5py.File(file_path, 'r') as hdf_file:
            if 'coordinates' in hdf_file:
                coordinates_group = hdf_file['coordinates']
                for dataset_name in ['dorsal_contours', 'skeletons', 'ventral_contours']:
                    if dataset_name in coordinates_group:
                        print(f"\nProcessing dataset: {dataset_name}")
                        data = coordinates_group[dataset_name][:]  # Read all data

                        if data.ndim > 2:
                            # Store 3D coordinates as JSON strings in CSV
                            print(f"  Storing 3D coordinates '{dataset_name}' as JSON strings in CSV.")
                            # Convert each 2D slice to a JSON string
                            json_data = [json.dumps(slice_2d.tolist()) for slice_2d in data]
                            df = pd.DataFrame({dataset_name: json_data})  # Store in a DataFrame
                            csv_file_path = os.path.join(output_dir, f"{dataset_name}.csv")
                            df.to_csv(csv_file_path, index=False)
                            print(f"  Dataset '{dataset_name}' successfully exported to: {csv_file_path}")
                        else:
                            print(f"  Warning: Dataset '{dataset_name}' is not 3D. Skipping.")
                    else:
                        print(f"  Warning: Dataset '{dataset_name}' not found in 'coordinates' group.")
            else:
                print(f"  Warning: 'coordinates' group not found in HDF5 file.")

    except Exception as e:
        print(f"An error occurred: {e}")

    print("\nCoordinate extraction process complete.")



if __name__ == "__main__":
    # Example usage:
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups'  # <--- Replace with the desired output folder

    extract_coordinates(hdf5_file_path, csv_output_directory)


Extracting coordinates from: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5
CSV files will be saved in: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups

Processing dataset: dorsal_contours
  Storing 3D coordinates 'dorsal_contours' as JSON strings in CSV.
  Dataset 'dorsal_contours' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups/dorsal_contours.csv

Processing dataset: skeletons
  Storing 3D coordinates 'skeletons' as JSON strings in CSV.
  Dataset 'skeletons' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups/skeletons.csv

Processing dataset: ventral_contours
  Storing 3D coordinates 'ventral_contours' as JSON strings in CSV.
  Dataset 'ventral_contours' successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups/ventral_contours.csv

Coordinate extraction process complete.


2. Provenance_tracking extraction

In [7]:
def extract_provenance_tracking(file_path, output_dir):
    """
    Extracts the attributes from the 'provenance_tracking' group in an HDF5 file
    and saves them to a CSV file.

    Args:
        file_path (str): The path to the HDF5 file.
        output_dir (str): The directory where the CSV file will be saved.
    """
    print(f"\n{'='*30}")
    print(f"Extracting provenance_tracking from: {file_path}")
    print(f"CSV file will be saved in: {output_dir}")
    print(f"{'='*30}")

    try:
        if not os.path.exists(file_path):
            print(f"Error: The file '{file_path}' was not found.")
            return

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        with h5py.File(file_path, 'r') as hdf_file:
            if 'provenance_tracking' in hdf_file:
                provenance_group = hdf_file['provenance_tracking']
                attributes = {}
                for attr_name, attr_value in provenance_group.attrs.items():
                    attributes[attr_name] = attr_value
                df = pd.DataFrame([attributes])  # Create a DataFrame with a single row
                csv_file_path = os.path.join(output_dir, "provenance_tracking.csv")
                df.to_csv(csv_file_path, index=False)
                print(f"Provenance tracking data successfully exported to: {csv_file_path}")
            else:
                print(f"Warning: 'provenance_tracking' group not found in HDF5 file.")

    except Exception as e:
        print(f"An error occurred: {e}")

    print("\nProvenance tracking extraction process complete.")



if __name__ == "__main__":
    # Example usage:
    hdf5_file_path = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5'  # <--- Replace with your HDF5 file path
    csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups'  # <--- Replace with the desired output folder

    extract_provenance_tracking(hdf5_file_path, csv_output_directory)


Extracting provenance_tracking from: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/n2_featuresN.hdf5
CSV file will be saved in: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups
Provenance tracking data successfully exported to: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups/provenance_tracking.csv

Provenance tracking extraction process complete.


Visualization of each dataset

1. Top-Level Datasets

In [8]:
csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets'

print(f"\n{'='*30}")
print(f"Visualizing the first 5 rows of the datasets in: {csv_output_directory}")
print(f"{'='*30}")
try:
    # Check if the output directory exists
    if not os.path.exists(csv_output_directory):
        print(f"Error: The directory '{csv_output_directory}' was not found.")
        exit()

    # Iterate through all files in the specified directory
    for filename in os.listdir(csv_output_directory):
        if filename.endswith(".csv"):  # Check if the file is a CSV file
            file_path = os.path.join(csv_output_directory, filename)
            try:
                # Read the CSV file into a Pandas DataFrame
                df = pd.read_csv(file_path)

                print(f"\n--- File: {filename} ---")
                print("First 5 rows:")
                if not df.empty:
                    print(df.head())
                else:
                    print("The DataFrame is empty.")

            except pd.errors.EmptyDataError:
                print(f"Warning: The file '{filename}' is empty.")
            except Exception as e:
                print(f"Error reading the file '{filename}': {e}")
except Exception as e:
    print(f"An error occurred: {e}")

print("\nDataset visualization process completed.")


Visualizing the first 5 rows of the datasets in: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Datasets

--- File: blob_features.csv ---
First 5 rows:
       coord_x      coord_y   area   perimeter  box_length  box_width  \
0  1647.895996  1662.704834  368.5  140.124893   54.063095  17.351398   
1  1648.061401  1662.817749  354.5  138.124893   53.835423  16.643318   
2  1647.644165  1662.226440  347.0  136.225403   53.534157  16.949924   
3  1647.155151  1660.760986  359.0  137.539108   51.894089  16.069649   
4  1647.464355  1660.255371  371.5  136.468033   54.126434  15.504248   

   quirkiness  compactness  box_orientation  solidity  intensity_mean  \
0    0.947097     0.235840       -32.125000  0.491333      114.920746   
1    0.951013     0.233497       -32.735230  0.494766      114.497580   
2    0.948553     0.234976       -32.471190  0.487018      113.791150   
3    0.950847     0.238480       -25.016895  0.504923      114.124405   
4    0.958097     0.250673       -

2. Datasets extracted from the groups

In [9]:
csv_output_directory = '/content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups'

print(f"\n{'='*30}")
print(f"Visualizing the first 5 rows of the datasets in: {csv_output_directory}")
print(f"{'='*30}")
try:
    # Check if the output directory exists
    if not os.path.exists(csv_output_directory):
        print(f"Error: The directory '{csv_output_directory}' was not found.")
        exit()

    # Iterate through all files in the specified directory
    for filename in os.listdir(csv_output_directory):
        if filename.endswith(".csv"):  # Check if the file is a CSV file
            file_path = os.path.join(csv_output_directory, filename)
            try:
                # Read the CSV file into a Pandas DataFrame
                df = pd.read_csv(file_path)

                print(f"\n--- File: {filename} ---")
                print("First 5 rows:")
                if not df.empty:
                    print(df.head())
                else:
                    print("The DataFrame is empty.")

            except pd.errors.EmptyDataError:
                print(f"Warning: The file '{filename}' is empty.")
            except Exception as e:
                print(f"Error reading the file '{filename}': {e}")
except Exception as e:
    print(f"An error occurred: {e}")

print("\nDataset visualization process completed.")


Visualizing the first 5 rows of the datasets in: /content/drive/MyDrive/Worms/Resultados/n2/n2_featuresN/Groups

--- File: provenance_tracking.csv ---
First 5 rows:
      CLASS                     TITLE VERSION
0  b'GROUP'  Empty(dtype=dtype('S1'))  b'1.0'

--- File: dorsal_contours.csv ---
First 5 rows:
                                     dorsal_contours
0  [[1641.0015869140625, 1636.00830078125], [1639...
1  [[1640.0, 1636.0], [1638.9564208984375, 1637.0...
2  [[1640.0167236328125, 1635.99853515625], [1638...
3  [[1642.9984130859375, 1634.99609375], [1641.51...
4  [[1642.0, 1633.0], [1640.9749755859375, 1634.0...

--- File: skeletons.csv ---
First 5 rows:
                                           skeletons
0  [[1641.0260009765625, 1636.0228271484375], [16...
1  [[1639.95947265625, 1635.9832763671875], [1640...
2  [[1640.028564453125, 1636.0118408203125], [163...
3  [[1642.9951171875, 1634.9979248046875], [1643....
4  [[1641.9586181640625, 1632.9881591796875], [16...

--- File: ven