# Subsetting STOFS-3D-Atl field2d.nc Files 
We are subsetting the forecast and nowcast data from files containing the current data for different time windows. We save subset data for different dates in separate folders. 

To begin, activate the conda environment: `source /nhc/Atieh.Alipour/environment/miniconda3/bin/activate env_subsetting`

## Modifying Node Connectivity and Dimension Names in field2d.nc Files

The field2d.nc files exhibit a different node connectivity than the schout_adcirc files, possibly because the original node connectivity was masked. To subset the data with the Thalassa package, we need to adjust the node connectivity and rename certain dimensions.

In [1]:
import dask
import geoviews as gv
import holoviews as hv
import numcodecs
import numpy as np
import pandas as pd
import shapely
import xarray as xr
import matplotlib.pyplot as plt
import s3fs  # Importing the s3fs library for accessing S3 buckets
import time  # Importing the time library for recording execution time
import shapely  # Importing shapely for geometric operations 
import thalassa  # Importing thalassa library for STOFS data analysis
from thalassa import api  # Importing thalassa API for data handling
from thalassa import normalization
from thalassa import utils
from holoviews import opts as hvopts
from holoviews import streams
from holoviews.streams import PointerXY
from holoviews.streams import Tap
import bokeh.plotting as bp

hv.extension("bokeh")


# Default appearance applied to every holoviews Image element in this notebook
IMAGE_DEFAULTS = dict(
    width=800,
    height=600,
    show_title=True,
    tools=["hover"],
    active_tools=["pan", "box_zoom"],
    cmap="jet",
)
hvopts.defaults(hvopts.Image(**IMAGE_DEFAULTS))

# Blosc compressor configured for zstd at compression level 1
COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=1)

In [2]:
def read_netcdf_from_s3(bucket_name, key):
    """
    Open a NetCDF object stored in an S3 bucket as an xarray Dataset.

    The object is accessed anonymously through s3fs and handed to xarray as
    an open binary file object.

    Parameters:
    - bucket_name: Name of the S3 bucket
    - key: Key/path of the NetCDF object inside the bucket

    Returns:
    - xarray Dataset for the object, opened without the 'nvel' variable
    """
    # No credentials are supplied: the bucket is read anonymously
    filesystem = s3fs.S3FileSystem(anon=True)
    remote_file = filesystem.open(f"s3://{bucket_name}/{key}", 'rb')
    # 'nvel' is excluded at open time via drop_variables
    return xr.open_dataset(remote_file, drop_variables=['nvel'])

In [23]:
def normalize_data(ds, bucket_name, base_key, filename, date):
    """
    Function to modify/normalize a dataset using the Thalassa package.

    If `ds` already contains 'element' it is normalized as-is. Otherwise the
    connectivity ('nele', 'nvertex', 'element') is copied from the
    `schout_adcirc_{date}.nc` file stored under the same `base_key`, and the
    SCHISM dimension/coordinate names are renamed before normalizing.

    Parameters:
    - ds: xarray Dataset containing the data
    - bucket_name: Name of the S3 bucket
    - base_key: Base key (folder) of the dataset in the S3 bucket
    - filename: Name of the file `ds` was read from; kept for interface
      compatibility, it is not needed to locate the schout_adcirc file
    - date: Date string (as used in the bucket keys) of the schout_adcirc file

    Returns:
    - normalized_ds: Thalassa dataset ready for cropping or visualizing
    """
    if 'element' not in ds:
        # Build the sibling key directly. Deriving it with str.replace() on the
        # full key would replace every occurrence of `filename`, including any
        # inside `base_key`, and corrupts the key when `filename` is empty.
        ds_with_element_key = f'{base_key}/schout_adcirc_{date}.nc'
        ds_with_element = read_netcdf_from_s3(bucket_name, ds_with_element_key)  # Read NetCDF data from S3 bucket

        # Modify the field2d.nc file based on schout_adcirc.nc file
        ds = ds.rename({'nSCHISM_hgrid_node': 'node', 'nSCHISM_hgrid_face': 'nele', 'nMaxSCHISM_hgrid_face_nodes': 'nvertex'})
        # Drop the face dimension (and the variables defined on it) so the
        # connectivity taken from the schout_adcirc file can replace it
        ds = ds.drop_dims('nele')
        ds['nele'] = ds_with_element['nele']
        ds['nvertex'] = ds_with_element['nvertex']
        ds['element'] = ds_with_element['element']
        ds = ds.rename({'SCHISM_hgrid_node_x': 'x', 'SCHISM_hgrid_node_y': 'y'})

    # Normalize data (single exit point for both cases)
    normalized_ds = thalassa.normalize(ds)
    return normalized_ds

In [4]:
def subset_thalassa(ds, box):
    """
    Crop a thalassa Dataset to a rectangular bounding box.

    Parameters:
    - ds: thalassa Dataset containing the data
    - box: Bounding box given as (x_min, x_max, y_min, y_max)

    Returns:
    - The part of `ds` returned by thalassa.crop for that bounding box
    """
    west, east, south, north = box[0], box[1], box[2], box[3]
    # shapely.box expects (xmin, ymin, xmax, ymax), hence the reordering
    bbox = shapely.box(west, south, east, north)
    return thalassa.crop(ds, bbox)


In [5]:

def save_subset_to_netcdf(xarray_ds, output_file):
    """
    Write a dataset subset to disk as a NetCDF file.

    Parameters:
    - xarray_ds: Subset of the xarray Dataset to be written
    - output_file: Path of the NetCDF file to create

    Returns:
    - None; the written file is the only result
    """
    write_netcdf = xarray_ds.to_netcdf  # writer bound to the dataset being saved
    write_netcdf(output_file)



In [21]:
# Testing Thalassa library: read one field2d.nc file from the public bucket and time the read

# perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments
start_time = time.perf_counter()  # Record the start time

bucket_name = 'noaa-nos-stofs3d-pds'
date = '20240325'
base_key = f'STOFS-3D-Atl/stofs_3d_atl.{date}'
filename = 'stofs_3d_atl.t12z.f001_024.field2d.nc'
key = f'{base_key}/{filename}'
dataset = read_netcdf_from_s3(bucket_name, key)  # Read NetCDF data from S3 bucket

end_time = time.perf_counter()  # Record the end time
execution_time = end_time - start_time  # Calculate execution time
print(f"Execution time for reading data: {execution_time} seconds")  # Print execution time


Execution time for reading data: 4.875180721282959 seconds


In [24]:
# Time the normalization of the dataset read above.
# perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments
start_time = time.perf_counter()  # Record the start time

normalize_dataset = normalize_data(dataset, bucket_name, base_key, filename, date)
end_time = time.perf_counter()  # Record the end time
execution_time = end_time - start_time  # Calculate execution time
print(f"Execution time for normalizing: {execution_time} seconds")  # Print execution time


Execution time for normalizing: 5.376539945602417 seconds


In [25]:
# Time the cropping of the normalized dataset to a bounding box.
# perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments
start_time = time.perf_counter()  # Record the start time

# Define the bounding box as (x_min, x_max, y_min, y_max)
box = (-76.6, -76.4, 39.1, 39.3)

ds2 = subset_thalassa(normalize_dataset, box)  # Subset the thalassa dataset

end_time = time.perf_counter()  # Record the end time
execution_time = end_time - start_time  # Calculate execution time
print(f"Execution time for subsetting: {execution_time} seconds")  # Print execution time


Execution time for subsetting: 2.40216326713562 seconds


In [18]:
# Time writing the cropped dataset to a local NetCDF file.
# perf_counter() is the clock intended for measuring durations; unlike
# time.time() it is not affected by system clock adjustments
start_time = time.perf_counter()  # Record the start time
output_file = 'stofs3D_subset_test.nc'
save_subset_to_netcdf(ds2, output_file)  # Save the subset to a NetCDF file
end_time = time.perf_counter()  # Record the end time
execution_time = end_time - start_time  # Calculate execution time
print(f"Execution time for writing: {execution_time} seconds")  # Print execution time


Execution time for writing: 34.29424214363098 seconds


In [26]:
# Subset Data for Chris
# Surface currents for the daily cycles 20240320 through 20240331
# (nowcast and forecast files of each cycle)
# Bounding Box:
#   North: 39.3
#   South: 39.1
#   East: -76.15
#   West: -76.65


import os
from datetime import datetime, timedelta

bucket_name = 'noaa-nos-stofs3d-pds'
start_date = datetime.strptime('20240320', '%Y%m%d')
end_date = datetime.strptime('20240331', '%Y%m%d')
# One 'YYYYMMDD' string per day, both end dates included
dates = [(start_date + timedelta(days=i)).strftime('%Y%m%d') for i in range((end_date - start_date).days + 1)]

filenames = ['stofs_3d_atl.t12z.f001_024.field2d.nc', 'stofs_3d_atl.t12z.f025_048.field2d.nc', 'stofs_3d_atl.t12z.n001_024.field2d.nc']

# Bounding box as (x_min, x_max, y_min, y_max); it is the same for every file,
# so it is defined once instead of once per loop iteration
box = (-76.65, -76.15, 39.1, 39.3)

for date in dates:
    print(date)
    base_key = f'STOFS-3D-Atl/stofs_3d_atl.{date}'

    # One output directory per date, created once rather than once per file
    output_dir = f'./{date}'
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    for filename in filenames:
        key = f'{base_key}/{filename}'
        dataset = read_netcdf_from_s3(bucket_name, key)  # Read NetCDF data from S3 bucket
        normalize_dataset = normalize_data(dataset, bucket_name, base_key, filename, date)

        ds2 = subset_thalassa(normalize_dataset, box)  # Subset the Thalassa dataset

        # The subset keeps the name of the file it was cut from
        output_file = f'{output_dir}/{filename}'
        save_subset_to_netcdf(ds2, output_file)  # Save the subset to a NetCDF file



20240320
20240321
20240322
20240323
20240324
20240325
20240326
20240327
20240328
20240329
20240330
20240331


In [27]:
# Sanity check: reopen one of the saved subsets and display its summary
subset_path = './20240328/stofs_3d_atl.t12z.f025_048.field2d.nc'
ds = xr.open_dataset(subset_path)
ds

In [29]:
# Plot the "depth" variable of the reopened subset at its first time step
timestamp = pd.Timestamp(ds.time[0].values)
first_step = ds.sel(time=timestamp)  # `.isel()` etc. would work here as well

thalassa.plot(ds=first_step, variable="depth", title=f"depth: {timestamp}")

In [30]:
# Subset Data for Rachael
# Surface currents for the week around May 2nd, 2023 (cycles 20230429 through 20230506)
# Bounding Box:
#   North: 39.26
#   South: 39.14
#   West: -76.61
#   East: -76.43



import os
from datetime import datetime, timedelta

bucket_name = 'noaa-nos-stofs3d-pds'
start_date = datetime.strptime('20230429', '%Y%m%d')
end_date = datetime.strptime('20230506', '%Y%m%d')
# One 'YYYYMMDD' string per day, both end dates included
dates = [(start_date + timedelta(days=i)).strftime('%Y%m%d') for i in range((end_date - start_date).days + 1)]

filenames = ['stofs_3d_atl.t12z.f001_024.field2d.nc', 'stofs_3d_atl.t12z.f025_048.field2d.nc', 'stofs_3d_atl.t12z.n001_024.field2d.nc']

# Bounding box as (x_min, x_max, y_min, y_max); it is the same for every file,
# so it is defined once instead of once per loop iteration
box = (-76.61, -76.43, 39.14, 39.26)

for date in dates:
    print(date)
    base_key = f'STOFS-3D-Atl/stofs_3d_atl.{date}'

    # One output directory per date, created once rather than once per file
    output_dir = f'./{date}'
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    for filename in filenames:
        key = f'{base_key}/{filename}'
        dataset = read_netcdf_from_s3(bucket_name, key)  # Read NetCDF data from S3 bucket
        normalize_dataset = normalize_data(dataset, bucket_name, base_key, filename, date)

        ds2 = subset_thalassa(normalize_dataset, box)  # Subset the Thalassa dataset

        # The subset keeps the name of the file it was cut from
        output_file = f'{output_dir}/{filename}'
        save_subset_to_netcdf(ds2, output_file)  # Save the subset to a NetCDF file



20230429
20230430
20230501
20230502
20230503
20230504
20230505
20230506


In [None]:
# Subset Data for Rachael
# Surface currents for the week around Apr 1st, 2023 (cycles 20230328 through 20230404)
# Bounding Box:
#   North: 39.26
#   South: 39.14
#   West: -76.61
#   East: -76.43



import os
from datetime import datetime, timedelta

bucket_name = 'noaa-nos-stofs3d-pds'
start_date = datetime.strptime('20230328', '%Y%m%d')
end_date = datetime.strptime('20230404', '%Y%m%d')
# One 'YYYYMMDD' string per day, both end dates included
dates = [(start_date + timedelta(days=i)).strftime('%Y%m%d') for i in range((end_date - start_date).days + 1)]

filenames = ['stofs_3d_atl.t12z.f001_024.field2d.nc', 'stofs_3d_atl.t12z.f025_048.field2d.nc', 'stofs_3d_atl.t12z.n001_024.field2d.nc']

# Bounding box as (x_min, x_max, y_min, y_max); it is the same for every file,
# so it is defined once instead of once per loop iteration
box = (-76.61, -76.43, 39.14, 39.26)

for date in dates:
    print(date)
    base_key = f'STOFS-3D-Atl/stofs_3d_atl.{date}'

    # One output directory per date, created once rather than once per file
    output_dir = f'./{date}'
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    for filename in filenames:
        key = f'{base_key}/{filename}'
        dataset = read_netcdf_from_s3(bucket_name, key)  # Read NetCDF data from S3 bucket
        normalize_dataset = normalize_data(dataset, bucket_name, base_key, filename, date)

        ds2 = subset_thalassa(normalize_dataset, box)  # Subset the Thalassa dataset

        # The subset keeps the name of the file it was cut from
        output_file = f'{output_dir}/{filename}'
        save_subset_to_netcdf(ds2, output_file)  # Save the subset to a NetCDF file


20230328
20230329
20230330
20230331
20230401
20230402
20230403
20230404


In [3]:
import xarray as xr
import thalassa  
import pandas as pd
from holoviews import opts as hvopts
import holoviews as hv


# Reopen one saved subset and take its first time step for plotting
ds = xr.open_dataset('./20240328/stofs_3d_atl.t12z.f025_048.field2d.nc')
timestamp = pd.Timestamp(ds.time[0].values)

hv.extension("bokeh")

# Set some defaults for the visualization of the graphs
hvopts.defaults(
    hvopts.Image(
        width=800,
        height=600,
        show_title=True,
        tools=["hover"],
        active_tools=["pan", "box_zoom"],
        cmap="jet",
    ),
)


# The title is derived from the plotted variable so the two cannot disagree
# (the title previously read "depth" while "uvel_surface" was plotted)
variable = "uvel_surface"

thalassa.plot(
    ds=ds.sel(time=timestamp),  # `.isel()` etc. would work here as well
    variable=variable,
    title=f"{variable}: {timestamp}"
)