In [2]:
import struct
import os
import gzip
import json

# Directory holding the gzip-compressed, JSON-encoded annotation files to inspect.
shard_directory = "/net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann1/by_id/"

# Divisors applied to the stored coordinates for the "Point (Voxels)" line.
# NOTE(review): presumably the x/y pixel size and the section thickness; the
# captured output shows ~19.5 million "voxels" in x for a stored value of
# 6336850.0, which suggests the stored coordinates may not be in µm — confirm
# the units before relying on the printed conversions.
XY_RESOLUTION = 0.325
Z_RESOLUTION = 20


def _decode_payload(raw):
    """Decode decompressed bytes to text.

    UTF-8 is tried first; on failure the bytes are decoded as latin-1, which
    maps every byte value to a code point and therefore cannot fail.

    Args:
        raw: Decompressed file content as ``bytes``.

    Returns:
        The decoded ``str``.
    """
    try:
        text = raw.decode('utf-8')
    except UnicodeDecodeError as e:
        print(f"Error decoding with UTF-8: {e}. Trying 'latin-1' encoding.")
        return raw.decode('latin-1')
    print(f"Decompressed content: {text[:500]}")  # Print first 500 characters
    return text


def _iter_json_objects(text):
    """Yield each top-level JSON value from back-to-back JSON documents.

    Unlike splitting on the literal ``'}{'``, this tolerates whitespace or
    newlines between documents and ``}{`` sequences inside string values.
    Empty / whitespace-only input yields nothing.

    Args:
        text: A string containing zero or more concatenated JSON documents.

    Yields:
        The decoded Python value of each document, in order.

    Raises:
        json.JSONDecodeError: At the first malformed document; documents
            before it have already been yielded.
    """
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while True:
        # raw_decode() does not skip leading whitespace itself.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            return
        value, pos = decoder.raw_decode(text, pos)
        yield value


def _print_annotation(annotation):
    """Print the id and point of one decoded annotation.

    Args:
        annotation: A decoded JSON value; expected to be a dict with a
            3-element numeric ``"point"`` and, optionally, an ``"id"``.
    """
    if not isinstance(annotation, dict) or 'point' not in annotation:
        print("No 'point' key in this annotation!")
        return

    point = annotation["point"]
    try:
        # Build every line first so a malformed point produces no partial output.
        lines = [
            f"Annotation ID: {annotation.get('id')}, Point: {point}",
            f"Point (µm): ({point[0]:.3f}, {point[1]:.3f}, {point[2]:.3f})",
            f"Point (Voxels): ({point[0]/XY_RESOLUTION:.3f}, {point[1]/XY_RESOLUTION:.3f}, {point[2]/Z_RESOLUTION:.3f})",
            '-' * 40,
        ]
    except (TypeError, IndexError, ValueError) as e:
        print(f"Skipping annotation with malformed point {point!r}: {e}")
        return
    print("\n".join(lines))


# List all entries in the directory; sorted because os.listdir() returns them
# in arbitrary order, which would make the cell output non-reproducible.
shard_files = sorted(os.listdir(shard_directory))

for shard_file in shard_files:
    file_path = os.path.join(shard_directory, shard_file)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories and other non-regular entries
    print(f"Reading {file_path}...")

    # Keep the try narrow so the message below only ever reports genuine
    # read/decompression failures, not downstream parsing problems.
    try:
        with gzip.open(file_path, "rb") as f:
            raw_bytes = f.read()
    except Exception as e:  # deliberately broad: best-effort scan of every file
        print(f"Error reading or decompressing file {file_path}: {e}")
        continue

    # Print the first few bytes of the decompressed data to understand its structure
    print(f"Raw data from {file_path[:50]}...: {raw_bytes[:200]}")

    annotations = _decode_payload(raw_bytes)

    try:
        for annotation in _iter_json_objects(annotations):
            _print_annotation(annotation)
    except json.JSONDecodeError as e:
        # The stream cannot be resynchronised after a malformed document, so
        # the remainder of this file is skipped.
        print(f"Error decoding JSON annotation: {e}")

print("DONE")


Reading /net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann1/by_id/0000000000000000...
Raw data from /net/birdstore/Active_Atlas_Data/data_root/pipelin...: b'{"@type": "neuroglancer_annotation", "id": "0", "point": [6336850.0, 4209725.0, 1400000.0], "type": "point", "properties": {"radius": 200.0}}'
Decompressed content: {"@type": "neuroglancer_annotation", "id": "0", "point": [6336850.0, 4209725.0, 1400000.0], "type": "point", "properties": {"radius": 200.0}}
Annotation ID: 0, Point: [6336850.0, 4209725.0, 1400000.0]
Point (µm): (6336850.000, 4209725.000, 1400000.000)
Point (Voxels): (19498000.000, 12953000.000, 70000.000)
----------------------------------------
Reading /net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann1/by_id/0000000000000002...
Raw data from /net/birdstore/Active_Atlas_Data/data_root/pipelin...: b'{"@type": "neuroglanc

In [7]:
import os

# Directory whose entries will be inspected at the raw-byte level
shard_directory = "/net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann/by_id/"

# Snapshot of the directory's entry names (order is whatever os.listdir returns)
shard_files = os.listdir(shard_directory)

# Dump the leading bytes of every regular file as hex so the on-disk format
# can be identified from its header.
for shard_file in shard_files:
    file_path = os.path.join(shard_directory, shard_file)
    print(f"Reading {file_path}...")

    # Anything that is not a regular file gets the "Reading" line only.
    if not os.path.isfile(file_path):
        continue

    try:
        with open(file_path, "rb") as f:
            # Only the first 16 bytes are read (raise this if more header is needed)
            first_bytes = f.read(16)

            # Two lowercase hex digits per byte, separated by single spaces
            hex_dump = ' '.join(format(byte, '02x') for byte in first_bytes)
            print(f"First 16 bytes: {hex_dump}")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

print("DONE")


Reading /net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann/by_id/0000000000000001...
First 16 bytes: 1f 8b 08 00 ad 4f 1d 68 02 ff ab 56 72 28 a9 2c
Reading /net/birdstore/Active_Atlas_Data/data_root/pipeline_data/CTB015/www/neuroglancer_data/annotations/ML_POSITIVE.precomputed_ann/by_id/0000000000000000...
First 16 bytes: 1f 8b 08 00 ad 4f 1d 68 02 ff ab 56 72 28 a9 2c
DONE
