# Goal
We have downloaded the data for the focus regions and now want to filter it.
- We only want to keep the data for the focus region bounding box and a buffer zone around it
- We keep data that has raised no quality flags
- We keep data for the "open_water" class (class 4) and the "water_near_land" class (class 3)

In [1]:
# Standard library imports
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import json
from pathlib import Path
import os
import glob
import zipfile
from functools import partial

# Third-party library imports
import geopandas as gpd
import pandas as pd
import contextily as ctx
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import netCDF4
import earthaccess

# Matplotlib inline magic command
%matplotlib inline

# Load the configuration file which we created in the previous step
try:
    with open('config.json', 'r') as f:
        config = json.load(f)
except FileNotFoundError:
    print("config.json not found. Please ensure that the 01 notebook's first two code blocks have been run to generate the config.json file.")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# This dictionary explicitely outlines the data columns and their data types
# It is mainyl intended to be read and understood by the user.

data_columns = {
    "classification": {
        "type": int,
        "null_value": 255,
        "description": "Flags indicating water detection results.",
    },
    "classification_str": {
        "type": str,
        "null_value": "unknown",
        "value_map": {
            1: "land",
            2: "land_near_water ",
            3: "water_near_land ",
            4: "open_water ",
            5: "dark_water  ",
            6: "low_coh_water_near_land  ",
            7: "open_low_coh_water "
        },
        "description": "Flags indicating water detection results as string.",
    },
    "layover_impact": {
        "type": float,
        "null_value": 9.969209968386869e+36,
        "description": "Estimate of the height error caused by layover, which may not be reliable on a pixel by pixel basis, but may be useful to augment aggregated height uncertainties. ",
    },
    "height": {
        "type": float,
        "null_value": 9.96921e+36,
        "description": "Height of the pixel above the reference ellipsoid.",
    },
    "illumination_time": {
        "type": datetime,
        "null_value": 9.969209968386869e+36,
        "description": "Time of measurement in seconds in the UTC time scale since 1 Jan 2000 00:00:00 UTC. [tai_utc_difference] is the difference between TAI and UTC reference time (seconds) for the first measurement of the data set. If a leap second occurs within the data set, the attribute leap_second is set to the UTC time at which the leap second occurs. ",
    },
	"geolocation_qual": {
        "type": int,
        "null_value": 4294967295,
        "description": "Quality flag for the geolocation quantities in the pixel cloud data",
		"flag_masks": {
            "layover_significant": 1,
            "phase_noise_suspect": 2,
            "phase_unwrapping_suspect": 4,
            "model_dry_tropo_cor_suspect": 8,
            "model_wet_tropo_cor_suspect": 16,
            "iono_cor_gim_ka_suspect": 32,
            "xovercal_suspect": 64,
            "suspect_karin_telem": 1024,
            "medium_phase_suspect": 4096,
            "tvp_suspect": 8192,
            "sc_event_suspect": 16384,
            "small_karin_gap": 32768,
            "specular_ringing_degraded": 524288,
            "model_dry_tropo_cor_missing": 1048576,
            "model_wet_tropo_cor_missing": 2097152,
            "iono_cor_gim_ka_missing": 4194304,
            "xovercal_missing": 8388608,
            "geolocation_is_from_refloc": 16777216,
            "no_geolocation_bad": 33554432,
            "medium_phase_bad": 67108864,
            "tvp_bad": 134217728,
            "sc_event_bad": 268435456,
            "large_karin_gap": 536870912
        }
    },
	"pixc_line_qual": {},
}

---

In [3]:
# Define a thread-safe processing function with progress reporting
def process_file(filepath, file_index, total_files):
    try:
        # Calculate percentage complete for this file
        percent_complete = (file_index / total_files) * 100

        # Load data
        ds = xr.open_dataset(filepath, group='pixel_cloud')

        # Load the bounding box
        bbox_path = Path(config['data_dir']) / 'bbox.geojson'
        with open(bbox_path, 'r') as f:
            bbox_data = json.load(f)
        bbox_gdf = gpd.GeoDataFrame.from_features(bbox_data['features'])

        # Process data
        cols = [col for col in data_columns.keys() if col in ds.variables]
        df = ds[cols].to_dataframe()
        full_len = len(df)

        # Apply filters
        df = df[(df['latitude'] >= bbox_gdf.bounds.miny[0]) &
                (df['latitude'] <= bbox_gdf.bounds.maxy[0]) &
                (df['longitude'] >= bbox_gdf.bounds.minx[0]) &
                (df['longitude'] <= bbox_gdf.bounds.maxx[0])]

        spatial_filtered = len(df)

        df = df[(df['classification'] == 3) | (df['classification'] == 4)]
        class_filtered = len(df)

        df['classification_str'] = df['classification'].replace(data_columns['classification_str']['value_map'])

        print(f"[{percent_complete:.1f}%] Processed {filepath.name}: {class_filtered}/{spatial_filtered}/{full_len} records")
        return df
    except Exception as e:
        print(f"[{percent_complete:.1f}%] ERROR: {e} - Skipping file {filepath}")
        raise e

# Main processing logic
pixel_cloud_dir = Path(config['water_mask_pixel_cloud_dir'])
file_list = list(pixel_cloud_dir.glob('*.nc'))
total_files = len(file_list)
print(f"[INFO] Found {total_files} files to process - reported percentage only approximate due to threading.")

results = []

# Attempt parallel processing first
try:
    raise MemoryError("Simulated memory error to ensure sequential processing. Bypass only on high memory systems.")
    with ThreadPoolExecutor(max_workers=min(os.cpu_count() * 2, 16)) as executor:
        # Submit tasks with index information
        futures = [executor.submit(process_file, filepath, i+1, total_files)
                   for i, filepath in enumerate(file_list)]
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                print(f"[ERROR] Error in worker: {e}")
                raise e

except MemoryError as me:
    print("[WARNING] Parallel processing failed due to limited system memory. Falling back to sequential processing.")
    results = []
    for i, filepath in enumerate(file_list):
        df = process_file(filepath, i+1, total_files)
        results.append(df)
except Exception as e:
    print(f"[ERROR] Parallel processing failed due to: {e}. Falling back to sequential processing.")
    results = []
    for i, filepath in enumerate(file_list):
        df = process_file(filepath, i+1, total_files)
        results.append(df)

# Combine results
final_df = pd.concat(results)
print(f"[INFO] Processing complete. Final dataset contains {len(final_df)} entries.")


[INFO] Found 148 files to process - reported percentage only approximate due to threading.
[0.7%] ERROR: Unable to allocate 57.3 GiB for an array with shape (4681946, 3284) and data type float32 - Skipping file data/Water mask pixel cloud 3/SWOT_L2_HR_PIXC_021_165_154R_20240916T093816_20240916T093827_PIC0_01.nc


MemoryError: Unable to allocate 57.3 GiB for an array with shape (4681946, 3284) and data type float32

In [None]:
# Save the final dataframe to a geojson file (this may take a while)
gdf = gpd.GeoDataFrame(final_df, geometry=gpd.points_from_xy(final_df.longitude, final_df.latitude))
gdf.crs = "EPSG:4326"
print(gdf.dtypes)

output_dir = Path(config['data_dir']) / 'output'
output_dir.mkdir(exist_ok=True)
output_file_name = output_dir / f'subset_class3and4_2020to2024_{datetime.now().strftime("%Y%m%d%H%M%S")}.geojson'

# sloppy fix for index64 pandas issue
gdf["row_id"] = gdf.index + 1
gdf.reset_index(drop=True, inplace=True)
gdf.set_index("row_id", inplace = True)

# save to geojson
gdf.to_file(output_file_name, driver='GeoJSON')
print(f"File saved to {output_file_name}")

File saved to data/output/subset_class3and4_2020to2024_20250309130642.geojson
