In [23]:
# Standard library imports
from datetime import datetime
import json
from pathlib import Path
import os
import glob
import zipfile

# Third-party library imports
import geopandas as gpd
import pandas as pd
import polars as pl
import contextily as ctx
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import netCDF4
import earthaccess

# Matplotlib inline magic command
%matplotlib inline

# Load the notebook configuration (directory paths used by the cells below).
# JSON is UTF-8 by specification; passing the encoding explicitly avoids the
# platform default (e.g. cp1252 on Windows) mis-decoding non-ASCII paths.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [24]:
# Catalogue of the pixel-cloud variables this notebook works with.
# Each entry maps a column name to:
#   - "type":        the Python type the column is meant to hold
#   - "null_value":  the sentinel marking a missing value
#                    (presumably the file's _FillValue - TODO confirm against the product spec)
#   - "description": free-text description of the variable
# plus optional lookup tables ("value_map", "flag_masks").
# "classification_str" is a derived column (built from "classification" via
# "value_map"), so it is not expected to exist in the NetCDF files themselves.
data_columns = {
    "classification": {
        "type": int,
        "null_value": 255,
        "description": "Flags indicating water detection results.",
    },
    "classification_str": {
        "type": str,
        "null_value": "unknown",
        # Labels must not carry trailing whitespace: they end up verbatim in the
        # output column and would otherwise break comparisons such as
        # `classification_str == "open_water"`.
        "value_map": {
            1: "land",
            2: "land_near_water",
            3: "water_near_land",
            4: "open_water",
            5: "dark_water",
            6: "low_coh_water_near_land",
            7: "open_low_coh_water"
        },
        "description": "Flags indicating water detection results as string.",
    },
    "layover_impact": {
        "type": float,
        "null_value": 9.969209968386869e+36,
        "description": "Estimate of the height error caused by layover, which may not be reliable on a pixel by pixel basis, but may be useful to augment aggregated height uncertainties. ",
    },
    "height": {
        "type": float,
        # NOTE(review): shorter literal than the other float null values
        # (9.96921e+36 vs 9.969209968386869e+36) - confirm which one the files use.
        "null_value": 9.96921e+36,
        "description": "Height of the pixel above the reference ellipsoid.",
    },
    "illumination_time": {
        "type": datetime,
        "null_value": 9.969209968386869e+36,
        "description": "Time of measurement in seconds in the UTC time scale since 1 Jan 2000 00:00:00 UTC. [tai_utc_difference] is the difference between TAI and UTC reference time (seconds) for the first measurement of the data set. If a leap second occurs within the data set, the attribute leap_second is set to the UTC time at which the leap second occurs. ",
    },
    "geolocation_qual": {
        "type": int,
        "null_value": 4294967295,
        "description": "Quality flag for the geolocation quantities in the pixel cloud data",
        # Bit masks: each name corresponds to a single bit of the flag word, so
        # a condition is tested with `value & mask`.
        "flag_masks": {
            "layover_significant": 1,
            "phase_noise_suspect": 2,
            "phase_unwrapping_suspect": 4,
            "model_dry_tropo_cor_suspect": 8,
            "model_wet_tropo_cor_suspect": 16,
            "iono_cor_gim_ka_suspect": 32,
            "xovercal_suspect": 64,
            "suspect_karin_telem": 1024,
            "medium_phase_suspect": 4096,
            "tvp_suspect": 8192,
            "sc_event_suspect": 16384,
            "small_karin_gap": 32768,
            "specular_ringing_degraded": 524288,
            "model_dry_tropo_cor_missing": 1048576,
            "model_wet_tropo_cor_missing": 2097152,
            "iono_cor_gim_ka_missing": 4194304,
            "xovercal_missing": 8388608,
            "geolocation_is_from_refloc": 16777216,
            "no_geolocation_bad": 33554432,
            "medium_phase_bad": 67108864,
            "tvp_bad": 134217728,
            "sc_event_bad": 268435456,
            "large_karin_gap": 536870912
        }
    }

}

---

In [25]:
# 1. Locate every NetCDF granule in the pixel cloud directory.
pixel_cloud_dir = Path(config['water_mask_pixel_cloud_dir'])
# sorted() makes the processing order (and thus the row order of the result)
# deterministic regardless of how the file system enumerates entries.
file_list = sorted(pixel_cloud_dir.glob('*.nc'))
if not file_list:
    print(f"[WARN] No .nc files found in {pixel_cloud_dir}")

# 2. Load the bounding box from a GeoJSON file as a GeoDataFrame.
#    This does not depend on the granule, so it is read once up front; a
#    missing or broken bbox file now fails immediately instead of silently
#    skipping every granule.
bbox_path = Path(config['data_dir']) / 'bbox.geojson'
with open(bbox_path, 'r', encoding='utf-8') as f:
    bbox_data = json.load(f)
bbox_gdf = gpd.GeoDataFrame.from_features(bbox_data['features'])
# Bounds of the first feature; the `bounds` columns are minx, miny, maxx, maxy.
min_lon, min_lat, max_lon, max_lat = bbox_gdf.bounds.iloc[0]

# Per-file frames are collected here and concatenated once after the loop
# (repeated pd.concat inside the loop copies all previous rows every time).
frames = []

for filepath in file_list:
    try:
        print(f"[INFO] Found file: {filepath}")

        # 3. Open the `pixel_cloud` group; the context manager guarantees the
        #    file handle is released even if a later step raises.
        with xr.open_dataset(filepath, group='pixel_cloud') as ds:
            # Keep only the columns we are interested in that actually exist
            # in this file. A new list is built instead of removing items
            # from the list being iterated, which would skip the element
            # following each removed one.
            data_columns_selection = []
            for column in data_columns:
                if column in ds.variables:
                    data_columns_selection.append(column)
                else:
                    print(f"[INFO] Column {column} not found in dataset.")
            # to_dataframe() materialises the values, so `df` stays usable
            # after the dataset is closed.
            df = ds[data_columns_selection].to_dataframe()
        full_len = len(df)

        # 4. Keep only the pixels inside the bbox (bounds inclusive).
        #    .copy() detaches the result from the unfiltered frame so the
        #    column assignment below does not write into a slice.
        in_bbox = (
            df['latitude'].between(min_lat, max_lat)
            & df['longitude'].between(min_lon, max_lon)
        )
        df = df.loc[in_bbox].copy()
        print(f"Dataset contains ({len(df)}/{full_len}) entries (Spatial filtering).")

        # 5. Add the classification as a human-readable string column.
        df['classification_str'] = df['classification'].replace(data_columns['classification_str']['value_map'])

        frames.append(df)

    except Exception as e:
        # Best effort: a granule that cannot be processed is reported and skipped.
        print(f"[ERROR] {e} - Skipping file {filepath}")
        continue

# 6. Combine all granules; the original per-file index is preserved.
final_df = pd.concat(frames) if frames else pd.DataFrame()



[INFO] Found file: data\Water mask pixel cloud\SWOT_L2_HR_PIXC_010_318_155L_20240205T083356_20240205T083407_PIC0_01.nc
[INFO] Column classification_str not found in dataset.
Dataset contains (2764726/4434973) entries (Spatial filtering).
[INFO] Found file: data\Water mask pixel cloud\SWOT_L2_HR_PIXC_010_318_156L_20240205T083406_20240205T083417_PIC0_01.nc
[INFO] Column classification_str not found in dataset.
Dataset contains (2730960/6563603) entries (Spatial filtering).
[INFO] Found file: data\Water mask pixel cloud\SWOT_L2_HR_PIXC_011_012_154R_20240215T065603_20240215T065614_PIC0_01.nc
[INFO] Column classification_str not found in dataset.
Dataset contains (1959/3277260) entries (Spatial filtering).
[INFO] Found file: data\Water mask pixel cloud\SWOT_L2_HR_PIXC_011_012_155R_20240215T065613_20240215T065624_PIC0_01.nc
[INFO] Column classification_str not found in dataset.
Dataset contains (80015/4178722) entries (Spatial filtering).
[INFO] Found file: data\Water mask pixel cloud\SWOT_L

In [26]:
# Filtering the dataset: remember the size before filtering for the report below.
pre_filter_len = len(final_df)

# Keep only open-water pixels (classification code 4); every other class,
# including land (1), is dropped.
final_df = final_df.loc[final_df['classification'].eq(4)]

# Disabled alternative: additionally require `geolocation_qual < 4`.
# The threshold comes from a NASA example notebook:
# https://github.com/podaac/tutorials/blob/master/notebooks/datasets/SWOT_PIXC_PhaseUnwrap_localmachine.ipynb

# SNR flag (Product Description p.13) - not implemented yet.
# Measurements are only useful where the SNR is sufficiently high. The
# L2_HR_PIXC product keeps KaRIn measurements even when they fall outside the
# granule cross-track boundaries, but flags samples that are not reliable
# (e.g. due to insufficient SNR). This may already be covered by the
# geolocation_qual flag.

print(f"Dataset contains ({len(final_df)}/{pre_filter_len}) entries after filtering.")
# Downstream cells read the filtered data through `df`.
df = final_df


Dataset contains (15661/14068729) entries after filtering.


In [27]:
# Convert the filtered pixels to a point GeoDataFrame in WGS84 (lon/lat).
# df.copy() keeps the index changes below from touching `df` / `final_df`,
# and avoids writing into a filtered slice of another frame.
gdf = gpd.GeoDataFrame(
    df.copy(),
    geometry=gpd.points_from_xy(df['longitude'], df['latitude']),
    crs="EPSG:4326",
)

# print all gdf types
print(gdf.dtypes)

# Save the GeoDataFrame to a shapefile; the timestamp keeps earlier exports.
output_dir = Path(config['output_dir'])
# parents=True so a nested output directory is created in one go.
output_dir.mkdir(parents=True, exist_ok=True)
output_file_name = output_dir / f'water_mask_{datetime.now().strftime("%Y%m%d%H%M%S")}'

# Replace the inherited index with a 1-based sequential `row_id`.
# The inherited index comes from the individual input files and is carried
# through the concatenation unchanged, so deriving the id from it
# (`index + 1`) can yield the same id for rows of different files.
# A named index is written to the file as a regular field.
gdf = gdf.reset_index(drop=True)
gdf.index = pd.RangeIndex(start=1, stop=len(gdf) + 1, name="row_id")

# save to shp
# NOTE(review): the ESRI Shapefile format limits field names to 10 characters,
# so longer column names are presumably shortened by the driver - confirm the
# resulting field names in the written file.
gdf.to_file(output_file_name, driver='ESRI Shapefile')

classification               float32
layover_impact               float32
height                       float32
illumination_time     datetime64[ns]
geolocation_qual             float64
latitude                     float64
longitude                    float64
classification_str            object
geometry                    geometry
dtype: object


  gdf.to_file(output_file_name, driver='ESRI Shapefile')
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
