# Explore HDF5/MAT File Contents

This notebook explores what data is available in the HDF5/MAT files to see if we can eliminate the CSV dependency for geographic coordinates.

In [1]:
import sys
from pathlib import Path
import h5py
import scipy.io
import numpy as np
import pandas as pd

# Add src to path for local development
sys.path.insert(0, str(Path.cwd().parent / "src"))

from xopr.stac import discover_campaigns, discover_flight_lines

In [2]:
# Get a sample MAT file to explore
data_root = Path("/home/thomasteisberg/Documents/opr/opr_test_dataset_1")
campaigns = discover_campaigns(data_root)

print(f"Found {len(campaigns)} campaigns")

# Get first campaign and first flight
campaign = campaigns[0]
campaign_path = Path(campaign['path'])
flight_lines = discover_flight_lines(campaign_path, "CSARP_standard")

print(f"Campaign: {campaign['name']}")
print(f"Found {len(flight_lines)} flight lines")

# Get first MAT file
first_flight = flight_lines[0]
mat_file = Path(first_flight['mat_files'][0])
csv_file = Path(first_flight['csv_file'])

print(f"\nExploring MAT file: {mat_file.name}")
print(f"Corresponding CSV: {csv_file.name}")
print(f"MAT file exists: {mat_file.exists()}")
print(f"CSV file exists: {csv_file.exists()}")

Found 2 campaigns
Campaign: 2016_Antarctica_DC8
Found 55 flight lines

Exploring MAT file: Data_20161014_03_001.mat
Corresponding CSV: Data_20161014_03.csv
MAT file exists: True
CSV file exists: True


In [3]:
# Function to safely explore HDF5/MAT file contents
def explore_mat_file(file_path):
    """Explore contents of MAT/HDF5 file"""
    print(f"Exploring file: {file_path}")
    
    # Try HDF5 first
    try:
        print("\n=== Trying H5PY (HDF5 format) ===")
        with h5py.File(file_path, 'r') as f:
            print("Successfully opened with h5py")
            print(f"Keys: {list(f.keys())}")
            
            for key in f.keys():
                try:
                    data = f[key]
                    if hasattr(data, 'shape'):
                        print(f"  {key}: shape={data.shape}, dtype={data.dtype}")
                    else:
                        print(f"  {key}: {type(data)}")
                except Exception as e:
                    print(f"  {key}: Error reading - {e}")
        return True
                    
    except Exception as e:
        print(f"H5PY failed: {e}")
    
    # Try scipy.io for older MATLAB files
    try:
        print("\n=== Trying SCIPY.IO (MATLAB format) ===")
        data = scipy.io.loadmat(file_path, mat_dtype=True)
        print("Successfully opened with scipy.io")
        print(f"Keys: {list(data.keys())}")
        
        for key, value in data.items():
            if not key.startswith('__'):
                if hasattr(value, 'shape'):
                    print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
                else:
                    print(f"  {key}: {type(value)}")
        return True
        
    except Exception as e:
        print(f"SCIPY.IO failed: {e}")
        return False

# Explore the first MAT file
success = explore_mat_file(mat_file)

Exploring file: /home/thomasteisberg/Documents/opr/opr_test_dataset_1/2016_Antarctica_DC8/CSARP_standard/20161014_03/Data_20161014_03_001.mat

=== Trying H5PY (HDF5 format) ===
Successfully opened with h5py
Keys: ['#refs#', 'Bottom', 'Data', 'Elevation', 'GPS_time', 'Heading', 'Latitude', 'Longitude', 'Pitch', 'Roll', 'Surface', 'Time', 'param_combine', 'param_csarp', 'param_records']
  #refs#: <class 'h5py._hl.group.Group'>
  Bottom: shape=(195, 1), dtype=float64
  Data: shape=(195, 2781), dtype=float32
  Elevation: shape=(195, 1), dtype=float64
  GPS_time: shape=(195, 1), dtype=float64
  Heading: shape=(195, 1), dtype=float64
  Latitude: shape=(195, 1), dtype=float64
  Longitude: shape=(195, 1), dtype=float64
  Pitch: shape=(195, 1), dtype=float64
  Roll: shape=(195, 1), dtype=float64
  Surface: shape=(195, 1), dtype=float64
  Time: shape=(1, 2781), dtype=float64
  param_combine: <class 'h5py._hl.group.Group'>
  param_csarp: <class 'h5py._hl.group.Group'>
  param_records: <class 'h5p

In [4]:
# Look for GPS/coordinate data in the MAT file
def find_gps_data(file_path):
    """Look for GPS coordinate data in MAT file"""
    print(f"Searching for GPS data in: {file_path.name}")
    
    # Try both formats
    for method_name, loader in [("H5PY", h5py.File), ("SCIPY", scipy.io.loadmat)]:
        try:
            print(f"\n--- Using {method_name} ---")
            
            if method_name == "H5PY":
                with loader(file_path, 'r') as data:
                    keys = list(data.keys())
            else:
                data = loader(file_path, mat_dtype=True)
                keys = [k for k in data.keys() if not k.startswith('__')]
            
            print(f"All keys: {keys}")
            
            # Look for GPS-related keys
            gps_keys = [k for k in keys if 'gps' in k.lower() or 'lat' in k.lower() or 'lon' in k.lower()]
            print(f"GPS-related keys: {gps_keys}")
            
            # Look for known GPS fields
            common_gps_fields = ['GPS_time', 'Latitude', 'Longitude', 'lat', 'lon', 'GPS']
            found_fields = [k for k in keys if k in common_gps_fields]
            print(f"Common GPS fields found: {found_fields}")
            
            # Examine GPS_time that we know exists
            if 'GPS_time' in keys:
                if method_name == "H5PY":
                    gps_time_data = data['GPS_time']
                    print(f"GPS_time: shape={gps_time_data.shape}, dtype={gps_time_data.dtype}")
                    # Show first few values
                    time_vals = np.array(gps_time_data)
                    print(f"GPS_time sample values: {time_vals.flatten()[:5]}")
                else:
                    gps_time_data = data['GPS_time']
                    print(f"GPS_time: shape={gps_time_data.shape}, dtype={gps_time_data.dtype}")
                    print(f"GPS_time sample values: {gps_time_data.flatten()[:5]}")
            
            # Look for any field that might contain coordinate arrays
            coord_candidates = []
            for key in keys:
                try:
                    if method_name == "H5PY":
                        field_data = data[key]
                        if hasattr(field_data, 'shape') and len(field_data.shape) > 0:
                            shape = field_data.shape
                    else:
                        field_data = data[key]
                        if hasattr(field_data, 'shape') and len(field_data.shape) > 0:
                            shape = field_data.shape
                    
                    # Look for arrays that could be coordinates
                    if len(shape) >= 1 and shape[0] > 1:  # Has multiple points
                        coord_candidates.append((key, shape))
                        
                except Exception as e:
                    continue
            
            print(f"\nPotential coordinate arrays:")
            for key, shape in coord_candidates:
                print(f"  {key}: {shape}")
                
            return True
            
        except Exception as e:
            print(f"{method_name} failed: {e}")
            continue
    
    return False

# Search for GPS data
find_gps_data(mat_file)

Searching for GPS data in: Data_20161014_03_001.mat

--- Using H5PY ---
All keys: ['#refs#', 'Bottom', 'Data', 'Elevation', 'GPS_time', 'Heading', 'Latitude', 'Longitude', 'Pitch', 'Roll', 'Surface', 'Time', 'param_combine', 'param_csarp', 'param_records']
GPS-related keys: ['GPS_time', 'Latitude', 'Longitude']
Common GPS fields found: ['GPS_time', 'Latitude', 'Longitude']
H5PY failed: 'Unable to synchronously open object (invalid identifier type to function)'

--- Using SCIPY ---
SCIPY failed: Please use HDF reader for matlab v7.3 files, e.g. h5py


False

In [5]:
# Compare with CSV data to see what we're missing
print("=== CSV FILE CONTENTS ===")
csv_data = pd.read_csv(csv_file)
print(f"CSV shape: {csv_data.shape}")
print(f"CSV columns: {list(csv_data.columns)}")
print(f"\nFirst few rows:")
print(csv_data.head())

print(f"\nLAT range: {csv_data['LAT'].min():.6f} to {csv_data['LAT'].max():.6f}")
print(f"LON range: {csv_data['LON'].min():.6f} to {csv_data['LON'].max():.6f}")
print(f"Number of coordinate points: {len(csv_data)}")

=== CSV FILE CONTENTS ===
CSV shape: (195, 9)
CSV columns: ['LAT', 'LON', 'UTCTIMESOD', 'THICK', 'ELEVATION', 'FRAME', 'SURFACE', 'BOTTOM', 'QUALITY']

First few rows:
         LAT        LON  UTCTIMESOD    THICK  ELEVATION          FRAME  \
0 -71.349553 -69.859816  58337.4388 -9999.00  1101.5183  2016101403001   
1 -71.349681 -69.859688  58337.5434   642.62  1101.9702  2016101403001   
2 -71.349808 -69.859559  58337.6479   647.78  1102.4217  2016101403001   
3 -71.349936 -69.859430  58337.7523   652.43  1102.8757  2016101403001   
4 -71.350064 -69.859301  58337.8568   654.61  1103.3290  2016101403001   

   SURFACE   BOTTOM  QUALITY  
0 -9999.00 -9999.00      NaN  
1   502.77  1145.39      1.0  
2   499.66  1147.44      1.0  
3   497.47  1149.90      1.0  
4   499.65  1154.27      1.0  

LAT range: -71.374302 to -71.349553
LON range: -69.859816 to -69.834224
Number of coordinate points: 195


In [6]:
# Try to extract coordinate data from MAT file if it exists
def extract_coordinates_from_mat(file_path):
    """Try to extract lat/lon coordinates from MAT file"""
    coordinates = {'lat': None, 'lon': None, 'method': None}
    
    # Try H5PY first
    try:
        with h5py.File(file_path, 'r') as f:
            keys = list(f.keys())
            print(f"H5PY keys: {keys}")
            
            # Look for obvious coordinate fields
            lat_candidates = [k for k in keys if 'lat' in k.lower()]
            lon_candidates = [k for k in keys if 'lon' in k.lower()]
            
            print(f"Latitude candidates: {lat_candidates}")
            print(f"Longitude candidates: {lon_candidates}")
            
            # Try to extract data from candidates
            for lat_key in lat_candidates:
                try:
                    lat_data = np.array(f[lat_key])
                    print(f"  {lat_key}: shape={lat_data.shape}, range={lat_data.min():.6f} to {lat_data.max():.6f}")
                    if coordinates['lat'] is None and len(lat_data.shape) >= 1:
                        coordinates['lat'] = lat_data
                        coordinates['method'] = 'h5py'
                except Exception as e:
                    print(f"  Error reading {lat_key}: {e}")
            
            for lon_key in lon_candidates:
                try:
                    lon_data = np.array(f[lon_key])
                    print(f"  {lon_key}: shape={lon_data.shape}, range={lon_data.min():.6f} to {lon_data.max():.6f}")
                    if coordinates['lon'] is None and len(lon_data.shape) >= 1:
                        coordinates['lon'] = lon_data
                except Exception as e:
                    print(f"  Error reading {lon_key}: {e}")
            
            if coordinates['lat'] is not None and coordinates['lon'] is not None:
                return coordinates
                
    except Exception as e:
        print(f"H5PY approach failed: {e}")
    
    # Try scipy.io
    try:
        data = scipy.io.loadmat(file_path, mat_dtype=True)
        keys = [k for k in data.keys() if not k.startswith('__')]
        print(f"\nSCIPY keys: {keys}")
        
        # Look for coordinate fields
        lat_candidates = [k for k in keys if 'lat' in k.lower()]
        lon_candidates = [k for k in keys if 'lon' in k.lower()]
        
        print(f"Latitude candidates: {lat_candidates}")
        print(f"Longitude candidates: {lon_candidates}")
        
        for lat_key in lat_candidates:
            try:
                lat_data = np.array(data[lat_key])
                print(f"  {lat_key}: shape={lat_data.shape}, range={lat_data.min():.6f} to {lat_data.max():.6f}")
                if coordinates['lat'] is None and len(lat_data.shape) >= 1:
                    coordinates['lat'] = lat_data
                    coordinates['method'] = 'scipy'
            except Exception as e:
                print(f"  Error reading {lat_key}: {e}")
        
        for lon_key in lon_candidates:
            try:
                lon_data = np.array(data[lon_key])
                print(f"  {lon_key}: shape={lon_data.shape}, range={lon_data.min():.6f} to {lon_data.max():.6f}")
                if coordinates['lon'] is None and len(lon_data.shape) >= 1:
                    coordinates['lon'] = lon_data
            except Exception as e:
                print(f"  Error reading {lon_key}: {e}")
                
    except Exception as e:
        print(f"SCIPY approach failed: {e}")
    
    return coordinates

# Try to extract coordinates
mat_coords = extract_coordinates_from_mat(mat_file)

if mat_coords['lat'] is not None and mat_coords['lon'] is not None:
    print(f"\n✅ Found coordinates in MAT file using {mat_coords['method']}!")
    print(f"   LAT: {len(mat_coords['lat'])} points, range {mat_coords['lat'].min():.6f} to {mat_coords['lat'].max():.6f}")
    print(f"   LON: {len(mat_coords['lon'])} points, range {mat_coords['lon'].min():.6f} to {mat_coords['lon'].max():.6f}")
    
    # Compare with CSV
    print(f"\n📊 Comparison with CSV:")
    print(f"   MAT points: {len(mat_coords['lat'])}")
    print(f"   CSV points: {len(csv_data)}")
    
else:
    print(f"\n❌ No coordinate data found in MAT file")
    print(f"   CSV file is still needed for coordinates")

H5PY keys: ['#refs#', 'Bottom', 'Data', 'Elevation', 'GPS_time', 'Heading', 'Latitude', 'Longitude', 'Pitch', 'Roll', 'Surface', 'Time', 'param_combine', 'param_csarp', 'param_records']
Latitude candidates: ['Latitude']
Longitude candidates: ['Longitude']
  Latitude: shape=(195, 1), range=-71.374302 to -71.349553
  Longitude: shape=(195, 1), range=-69.859816 to -69.834224

✅ Found coordinates in MAT file using h5py!
   LAT: 195 points, range -71.374302 to -71.349553
   LON: 195 points, range -69.859816 to -69.834224

📊 Comparison with CSV:
   MAT points: 195
   CSV points: 195
