# Goal
We have downloaded the data for the focus regions and now want to filter it.
- We only want to keep the data for the focus region bounding box and a buffer zone around it
- We keep data that has raised no quality flags
- We keep data for the "open_water" class (class 4) and the "water_near_land" class (class 3)

In [6]:
# Standard library imports
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import json
from pathlib import Path
import os
import glob
import zipfile
from functools import partial

# Third-party library imports
import geopandas as gpd
import pandas as pd
import contextily as ctx
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import netCDF4
import earthaccess

import gc

# Matplotlib inline magic command
%matplotlib inline

# Load the configuration file which we created in the previous step
try:
    with open('config.json', 'r') as f:
        config = json.load(f)
except FileNotFoundError:
    print("config.json not found. Please ensure that the 01 notebook's first two code blocks have been run to generate the config.json file.")

# Create output directory if needed
output_dir = Path(config['data_dir']) / 'output'
output_dir.mkdir(exist_ok=True)

# Define class mapping
class_mapping = {3: 'water', 4: 'land'}

print("Libraries loaded and configuration set.")

Libraries loaded and configuration set.


In [7]:
# This dictionary explicitely outlines the data columns and their data types
# It is mainyl intended to be read and understood by the user.

data_columns = {
    "classification": {
        "type": int,
        "null_value": 255,
        "description": "Flags indicating water detection results.",
    },
    "classification_str": {
        "type": str,
        "null_value": "unknown",
        "value_map": {
            1: "land",
            2: "land_near_water ",
            3: "water_near_land ",
            4: "open_water ",
            5: "dark_water  ",
            6: "low_coh_water_near_land  ",
            7: "open_low_coh_water "
        },
        "description": "Flags indicating water detection results as string.",
    },
    "layover_impact": {
        "type": float,
        "null_value": 9.969209968386869e+36,
        "description": "Estimate of the height error caused by layover, which may not be reliable on a pixel by pixel basis, but may be useful to augment aggregated height uncertainties. ",
    },
    "height": {
        "type": float,
        "null_value": 9.96921e+36,
        "description": "Height of the pixel above the reference ellipsoid.",
    },
    "illumination_time": {
        "type": datetime,
        "null_value": 9.969209968386869e+36,
        "description": "Time of measurement in seconds in the UTC time scale since 1 Jan 2000 00:00:00 UTC. [tai_utc_difference] is the difference between TAI and UTC reference time (seconds) for the first measurement of the data set. If a leap second occurs within the data set, the attribute leap_second is set to the UTC time at which the leap second occurs. ",
    },
	"geolocation_qual": {
        "type": int,
        "null_value": 4294967295,
        "description": "Quality flag for the geolocation quantities in the pixel cloud data",
		"flag_masks": {
            "layover_significant": 1,
            "phase_noise_suspect": 2,
            "phase_unwrapping_suspect": 4,
            "model_dry_tropo_cor_suspect": 8,
            "model_wet_tropo_cor_suspect": 16,
            "iono_cor_gim_ka_suspect": 32,
            "xovercal_suspect": 64,
            "suspect_karin_telem": 1024,
            "medium_phase_suspect": 4096,
            "tvp_suspect": 8192,
            "sc_event_suspect": 16384,
            "small_karin_gap": 32768,
            "specular_ringing_degraded": 524288,
            "model_dry_tropo_cor_missing": 1048576,
            "model_wet_tropo_cor_missing": 2097152,
            "iono_cor_gim_ka_missing": 4194304,
            "xovercal_missing": 8388608,
            "geolocation_is_from_refloc": 16777216,
            "no_geolocation_bad": 33554432,
            "medium_phase_bad": 67108864,
            "tvp_bad": 134217728,
            "sc_event_bad": 268435456,
            "large_karin_gap": 536870912
        }
    },
	"pixc_line_qual": {},
}

---

In [8]:
# Load the bounding box
bbox_path = Path(config['data_dir']) / 'bbox.geojson'
with open(bbox_path, 'r') as f:
    bbox_data = json.load(f)
bbox_gdf = gpd.GeoDataFrame.from_features(bbox_data['features'])

# Get bounds for filtering
min_lat = bbox_gdf.bounds.miny[0]
max_lat = bbox_gdf.bounds.maxy[0]
min_lon = bbox_gdf.bounds.minx[0]
max_lon = bbox_gdf.bounds.maxx[0]

print(f"Bounding box loaded: Lat [{min_lat}, {max_lat}], Lon [{min_lon}, {max_lon}]")

Bounding box loaded: Lat [-0.819408861907936, -0.037710351979959], Lon [36.8584632059703, 37.2444933070158]


In [None]:
def process_file_fast(file_path, output_path, append=False):
    """
    Process a netCDF file with direct array indexing to minimize memory usage.
    """
    try:
        # Open the dataset
        with xr.open_dataset(file_path, group='pixel_cloud') as ds:
            # Get the coordinates we need
            if 'latitude' not in ds.variables or 'longitude' not in ds.variables:
                print(f"\tSkipping {file_path.name} - missing lat/lon variables")
                return 0
                
            # Get dimensions
            if 'points' in ds.sizes:
                main_dim = 'points'
            else:
                dims = list(ds.sizes.keys())
                print(f"\tAvailable dimensions: {dims}")
                for dim in ['time', 'record']:
                    if dim in dims:
                        main_dim = dim
                        break
                else:
                    # If no recognized dimension, use the largest
                    main_dim = max(ds.sizes.items(), key=lambda x: x[1])[0]
                    
            dim_size = ds.sizes[main_dim]
            print(f"\tUsing dimension '{main_dim}' with {dim_size} points")

            # Get variables we need
            needed_vars = ['latitude', 'longitude', 'classification']
            missing_vars = [var for var in needed_vars if var not in ds.variables]
            if missing_vars:
                print(f"  Skipping {file_path.name} - missing variables: {missing_vars}")
                return 0

            # Direct indexing with numpy for spatial filtering
            lat_array = ds.latitude.values
            lon_array = ds.longitude.values
            class_array = ds.classification.values
            
            # Create a mask for spatial filtering
            spatial_mask = (
                (lat_array >= min_lat) & 
                (lat_array <= max_lat) & 
                (lon_array >= min_lon) & 
                (lon_array <= max_lon)
            )
            
            # Further filter by classification (3 or 4)
            class_mask = np.logical_or(class_array == 3, class_array == 4)
            
            # Combine masks
            final_mask = np.logical_and(spatial_mask, class_mask)
            
            # Count results
            total_points = len(lat_array)
            spatial_points = np.sum(spatial_mask)
            final_points = np.sum(final_mask)
            
            print(f"\tProcessing {file_path.name}: Total={total_points}, In bbox={spatial_points}, Class 3/4={final_points}")
            
            # If we have points, save them
            if final_points > 0:
                # Create a dataframe with only the filtered points
                df = pd.DataFrame({
                    'latitude': lat_array[final_mask],
                    'longitude': lon_array[final_mask],
                    'classification': class_array[final_mask]
                })
                
                # Add classification string
                df['classification_str'] = df['classification'].map(class_mapping)
                
                # Handle appending
                if append and os.path.exists(output_path):
                    # Read existing data
                    try:
                        existing_df = pd.read_parquet(output_path)
                        # Combine with new data
                        combined_df = pd.concat([existing_df, df], ignore_index=True)
                        # Save combined data
                        combined_df.to_parquet(output_path, engine='pyarrow', compression='snappy', index=False)
                    except Exception as append_err:
                        print(f"\tWarning: Could not append to existing file ({str(append_err)}). Creating new file.")
                        df.to_parquet(output_path, engine='pyarrow', compression='snappy', index=False)
                else:
                    # Save directly without append
                    df.to_parquet(output_path, engine='pyarrow', compression='snappy', index=False)
                
                return final_points
            else:
                return 0
                
    except Exception as e:
        print(f"\tError processing {file_path.name}: {str(e)}")
        return 0

In [None]:
# Get list of files to process
pixel_cloud_dir = Path(config['water_mask_pixel_cloud_dir'])
file_list = list(pixel_cloud_dir.glob('*.nc'))
total_files = len(file_list)
print(f"Found {total_files} files to process")

# Set up output file and directory
output_file = Path(config['output_file'])
output_file.parent.mkdir(exist_ok=True)

# Optional: Process smaller batch for testing
# file_list = file_list[:10]  # Uncomment to test with fewer files
# total_files = len(file_list)

# Initialize counters
total_points = 0
processed_files = 0

# Remove existing output file to start fresh
if output_file.exists():
    output_file.unlink()
    print(f"Removed existing output file: {output_file}")

# Create empty dataframe for accumulating results
all_data = pd.DataFrame()

# Process all files
for i, file_path in enumerate(file_list):
    print(f"\nProcessing file {i+1}/{total_files}: {file_path.name}")
    
    # Process the file
    try:
        points = process_file_fast(file_path, output_file, append=(i > 0))
        total_points += points
        processed_files += 1
    except Exception as e:
        print(f"Error processing file: {str(e)}")
    
    # Clear memory
    gc.collect()
    
    # Status update every 10 files
    if (i+1) % 10 == 0 or (i+1) == total_files:
        print(f"\nStatus: Processed {i+1}/{total_files} files, found {total_points} points so far")

print(f"\nProcessing complete! Found {total_points} points in {processed_files} files.")

Found 148 files to process

Processing file 1/148: SWOT_L2_HR_PIXC_021_165_154R_20240916T093816_20240916T093827_PIC0_01.nc
Using dimension 'points' with 4681946 points
Processing SWOT_L2_HR_PIXC_021_165_154R_20240916T093816_20240916T093827_PIC0_01.nc: Total=4681946, In bbox=854814, Class 3/4=11599

Processing file 2/148: SWOT_L2_HR_PIXC_018_012_154R_20240710T081133_20240710T081144_PIC0_01.nc
Using dimension 'points' with 3471897 points
Processing SWOT_L2_HR_PIXC_018_012_154R_20240710T081133_20240710T081144_PIC0_01.nc: Total=3471897, In bbox=447, Class 3/4=1

Processing file 3/148: SWOT_L2_HR_PIXC_018_318_156L_20240721T063442_20240721T063453_PIC0_02.nc
Using dimension 'points' with 8153189 points
Processing SWOT_L2_HR_PIXC_018_318_156L_20240721T063442_20240721T063453_PIC0_02.nc: Total=8153189, In bbox=3475012, Class 3/4=64776

Processing file 4/148: SWOT_L2_HR_PIXC_018_165_154L_20240715T192302_20240715T192313_PIC0_01.nc
Using dimension 'points' with 3125903 points
Processing SWOT_L2_HR_

In [11]:
# Check if we should convert to GeoJSON
if total_points > 0:
    # Ask user if they want to convert
    convert = input(f"Convert {total_points} points to GeoJSON? This might require significant memory (y/n): ")
    
    if convert.lower() == 'y':
        try:
            # Create output directory
            output_dir = Path(config['data_dir']) / 'output'
            output_dir.mkdir(exist_ok=True)
            
            # Create GeoJSON filename
            geojson_file = output_dir / f'subset_class3and4_{datetime.now().strftime("%Y%m%d%H%M%S")}.geojson'
            
            print(f"Loading data from {output_file}...")
            df = pd.read_parquet(output_file)
            print(f"Converting {len(df)} points to GeoDataFrame...")
            
            # Create GeoDataFrame
            gdf = gpd.GeoDataFrame(
                df, 
                geometry=gpd.points_from_xy(df.longitude, df.latitude),
                crs="EPSG:4326"
            )
            
            # Fix for index64 pandas issue
            gdf["row_id"] = gdf.index + 1
            gdf.reset_index(drop=True, inplace=True)
            gdf.set_index("row_id", inplace=True)
            
            print(f"Saving to {geojson_file}...")
            gdf.to_file(geojson_file, driver='GeoJSON')
            print(f"GeoJSON file saved to {geojson_file}")
            
        except Exception as e:
            print(f"Error converting to GeoJSON: {str(e)}")
else:
    print("No points found, skipping GeoJSON conversion.")

Loading data from data/processed_data.parquet...
Converting 3373905 points to GeoDataFrame...
Saving to data/output/subset_class3and4_20250414133915.geojson...
GeoJSON file saved to data/output/subset_class3and4_20250414133915.geojson
