In [None]:
# In the CW3E network, the data for an individual station was split across multiple netcdf files
# because of file size limitations during the initial cleaning step.
# Here, we take all netcdf files corresponding to the same station and merge them into a single zarr store per station.
import xarray as xr 
import s3fs
import pandas as pd 
from tqdm import tqdm
import boto3

In [None]:
# Name of the s3 bucket only (no "s3://" scheme and no slashes); it is interpolated into s3 URIs further down
bucket = "wecc-historical-wx"
# Folder (key prefix) inside the bucket that is searched for CW3E netcdf files and receives the merged zarr stores; no trailing "/"
CW3E_cleaned_folder = "2_clean_wx/CW3E"

In [None]:
# Load the station list table straight from s3
csv_filepath_s3 = "s3://wecc-historical-wx/2_clean_wx/temp_clean_all_station_list.csv"
stations_df = pd.read_csv(csv_filepath_s3)

# Build one boolean mask per condition, then keep the rows satisfying both:
# the station belongs to the CW3E network AND it is flagged as cleaned ("Y")
is_cw3e = stations_df["network"] == "CW3E"
is_cleaned = stations_df["cleaned"] == "Y"
network_df = stations_df.loc[is_cw3e & is_cleaned]

In [None]:
def get_filenames_in_s3_folder(bucket, folder):
    """List the object keys under a folder of an s3 bucket, relative to that folder.

    Parameters
    ----------
    bucket : str
        Simply, the name of the bucket, with no slashes, prefixes, suffixes, etc...
    folder : str
        Folder within the bucket that you want the filenames from.
        A trailing "/" is tolerated: "[folder]" and "[folder]/" are treated the same.
        An empty string lists every key in the bucket.

    Returns
    -------
    files_in_s3 : list of str
        Keys found under the folder, with the leading "[folder]/" removed.
        Keys nested deeper keep their relative path (e.g. "subfolder/file.nc").

    Example
    -------
    You want to get all the filenames in a s3 bucket with the following path:
    s3 URI: "s3://wecc-historical-wx/1_raw_wx/VALLEYWATER/"
    >>> get_filenames_in_s3_folder(
    >>>    bucket = "wecc-historical-wx",
    >>>    folder = "1_raw_wx/VALLEYWATER"
    >>> )
    ['ValleyWater_6001_1900-01-01_2024-11-11.csv','ValleyWater_6004_1900-01-01_2024-11-11.csv']

    References
    ----------
    [1] https://stackoverflow.com/questions/59225939/get-only-file-names-from-s3-bucket-folder
    """

    s3 = boto3.resource("s3")
    s3_bucket = s3.Bucket(bucket)

    # Normalise to exactly one trailing "/" so the prefix only matches keys inside
    # this folder, and not sibling folders that merely start with the same
    # characters (e.g. "CW3E_old/..." when asking for "CW3E").
    folder = folder.rstrip("/")
    prefix = f"{folder}/" if folder else ""

    # Every key returned by the filter starts with `prefix`, so slicing by its
    # length yields the relative path. (Splitting the key on the folder string
    # raises IndexError when the separator is absent and returns the wrong piece
    # when the folder string occurs more than once in the key.)
    files_in_s3 = [
        f.key[len(prefix):]
        for f in s3_bucket.objects.filter(Prefix=prefix).all()
    ]

    # Delete empty filenames
    # Presumably the "empty" name is the folder placeholder key itself, which is
    # not a file but is still returned as an object by objects.filter
    files_in_s3 = [f for f in files_in_s3 if f != ""]

    return files_in_s3

In [None]:
# List every object under the cleaned CW3E folder, then keep only the netcdf files.
# Match on the ".nc" suffix instead of indexing into file.split("."): indexing raises
# IndexError for names without a "." and misclassifies names that contain several dots.
cw3e_files_all = get_filenames_in_s3_folder(bucket, CW3E_cleaned_folder)
cw3e_nc_files = [file for file in cw3e_files_all if file.endswith(".nc")]

In [None]:
# For each cleaned CW3E station: collect its netcdf files, concatenate them along
# time, and write the merged dataset to s3 as a single zarr store.
station_ids = network_df["era-id"].values
fs = s3fs.S3FileSystem()

print(f"Processing {len(station_ids)} stations for network: CW3E")
# NOTE(review): the [:1] slice restricts the run to the first station only, even though
# the message above reports the full count. Looks like a leftover from testing; remove
# the slice to process every station -- confirm before running.
for station_id in tqdm(station_ids[:1]):
    # Get s3 filepaths for that station
    # NOTE(review): this is a substring match, so an ID that is a prefix of another
    # station's ID would also pick up that station's files -- verify against the naming scheme.
    filenames_in_s3 = [file for file in cw3e_nc_files if station_id in file]
    if len(filenames_in_s3) == 0:
        print(f"No netcdfs found for station: {station_id}")
        continue  # Skip to next loop iteration
    filepaths_in_s3 = [
        f"s3://{bucket}/{CW3E_cleaned_folder}/{filename_in_s3}"
        for filename_in_s3 in filenames_in_s3
    ]

    # Read in all the files for that station
    # A file that fails to read is reported and skipped; the remaining files are still merged.
    ds_list = []
    for filepath in filepaths_in_s3:
        try:
            with fs.open(filepath) as fileObj:
                # .load() pulls the data into memory while the file handle is still open,
                # so the dataset remains usable after the handle is closed
                ds = xr.open_dataset(fileObj).load()
                ds_list.append(ds)
        except Exception as err:
            # Catch Exception (not a bare except) so that KeyboardInterrupt still stops the run
            print(f"File {filepath} for station {station_id} could not be read in: {err}")
            continue  # Skip to next loop iteration

    # Nothing could be read for this station, so there is nothing to merge
    if len(ds_list) == 0:
        continue  # Skip to next loop iteration

    # Concat along time dimension
    # And, sort by time so that its in chronological order (not default behavior of xr.concat)
    station_ds = xr.concat(ds_list, dim="time").sortby("time")

    # Write to zarr
    # mode="w" overwrites any existing store at this path
    zarr_s3_path = f"s3://{bucket}/{CW3E_cleaned_folder}/{station_id}.zarr"
    try:
        station_ds.to_zarr(
            zarr_s3_path,
            mode="w",
            consolidated=True,
        )
    except Exception as err:
        print(f"zarr for station {station_id} could not be successfully written to s3 bucket: {err}")
        continue  # Skip to next loop iteration
