In [2]:
import pandas as pd
import numpy as np
import time
import netCDF4 as nc
import datetime

In [3]:
# There are several data points on the same day.
# We assume that data is uniformly distributed across the day.
# We will add a small time increment to each data point to make them unique.

def add_time_stamps(df):
    """Make a datetime index unique by spreading repeated stamps over one day.

    Rows that share the same index value are assumed to be uniformly
    distributed over the following 24 hours: the i-th of n rows carrying the
    same stamp (in row order) is shifted by ``i * (1 day / n)``. Rows whose
    stamp occurs only once keep it unchanged.

    The previous implementation only filled the helper column for duplicated
    stamps, so unique rows ended up with a ``NaT`` index and a frame without
    any duplicates raised ``KeyError``. All rows are now handled in a single
    vectorised pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index, possibly containing duplicates.

    Returns
    -------
    pandas.DataFrame
        The same frame object (its index is replaced in place, as before),
        now indexed by the adjusted timestamps under the name ``"datetime"``.
    """
    day_ns = 86_400 * 10**9  # nanoseconds in one day

    stamps = pd.Series(pd.DatetimeIndex(df.index))
    same_stamp = stamps.groupby(stamps)
    # Position of each row among the rows sharing its stamp, and the group size.
    rank = same_stamp.cumcount().to_numpy(dtype="int64")
    group_size = same_stamp.transform("size").to_numpy(dtype="int64")

    # Integer nanosecond arithmetic keeps exact divisions (2, 3, 4, ... per day)
    # exact and cannot overflow because rank < group_size.
    offsets = pd.to_timedelta(rank * (day_ns // group_size), unit="ns")

    df.index = pd.DatetimeIndex(stamps) + offsets
    df.index.name = 'datetime'  # Rename the index for clarity

    return df

In [4]:
def load_and_save_CMEMS_data(obs_dir,platform_name):
    """Read sea-level records from netCDF files and write them out as CSV.

    For every file the ``SLEV`` variable is read, rows are kept where
    ``SLEV_QC <= 2`` and ``slev >= -100``, the series is centred on its own
    mean and its timestamps are made unique with ``add_time_stamps``. All
    files are then concatenated, averaged onto a regular 10-minute grid,
    smoothed with a left-closed 30-minute rolling mean and thinned to every
    third sample.

    Changes compared with the previous version:

    * each dataset is closed after reading (context manager);
    * invalid values are removed with a boolean mask; dropping by index label
      removed every row sharing a date with an invalid value, because the
      index is not unique at that point;
    * bins are aggregated with ``mean`` instead of ``sum().replace(0, nan)``,
      so several samples in one bin are no longer added up and genuine zeros
      are no longer turned into missing values;
    * a missing ``stations.csv`` is created instead of raising, and an
      existing row for ``platform_name`` is replaced instead of duplicated.

    Parameters
    ----------
    obs_dir : iterable of str
        Paths of the netCDF files to read.
    platform_name : str
        Station name; used for the output file name and the stations table.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(all_wl_rollmean, all_wl)``: the smoothed series that was written to
        ``../observations/{platform_name}_wl.csv`` (column ``water_level``,
        index ``datetime_UTC``) and the 10-minute series it was derived from.

    Raises
    ------
    ValueError
        If ``obs_dir`` yields no files.
    """
    # NOTE(review): adding a timedelta to a datetime.date keeps whole days
    # only, so any time-of-day information in TIME is discarded here and
    # add_time_stamps re-spreads the rows over each day afterwards. Starting
    # from datetime.datetime(1950, 1, 1) would keep the recorded times -
    # confirm whether the current behaviour is intended.
    start = datetime.date(1950,1,1)

    all_wl_list = []
    longitudes = []
    latitudes = []
    for o in obs_dir:
        # Read everything needed into memory, then release the file handle.
        with nc.Dataset(o) as ds:
            days_since_start = ds["TIME"][:].data
            data = ds["SLEV"][:].data.squeeze().tolist()
            # Squeezed like SLEV so the mask has one entry per row.
            quality_filter = np.squeeze(ds["SLEV_QC"][:].data<=2)
            lon = np.nanmean(ds["LONGITUDE"][:].data)
            lat = np.nanmean(ds["LATITUDE"][:].data)
        longitudes.append(lon)
        latitudes.append(lat)

        formatted_time = [start + datetime.timedelta(days = days)
                          for days in days_since_start]

        wl = pd.DataFrame({"slev": data}, index = pd.DatetimeIndex(formatted_time))
        wl = wl[quality_filter]
        # Remove invalid values row by row (NaN rows are kept, as before).
        wl = wl[~(wl["slev"] < -100)]
        # Center around zero
        wl = wl.assign(slev=wl["slev"] - np.mean(wl["slev"]))

        #  Add timestamps
        wl = add_time_stamps(wl)

        all_wl_list.append(wl)

    if not all_wl_list:
        raise ValueError("obs_dir contains no files to load")

    all_wl=pd.concat(all_wl_list)
    # Regular 10-minute grid; bins without observations become NaN
    all_wl = all_wl.resample('10min').mean()
    # Rolling mean value
    all_wl_rollmean = all_wl.rolling(window='30min',closed="left").mean()[::3]
    all_wl_rollmean = np.round(all_wl_rollmean.dropna(),decimals=4)
    all_wl_rollmean.index.name = "datetime_UTC"
    all_wl_rollmean.columns = ["water_level"]
    all_wl_rollmean.to_csv(f"../observations/{platform_name}_wl.csv")

    locations = pd.DataFrame({"station": [platform_name],
                              "longitude": [np.round(np.mean(longitudes),decimals=6)],
                              "latitude": [np.round(np.mean(latitudes),decimals=6)]})
    try:
        existing = pd.read_csv("../observations/stations.csv",index_col=0)
    except FileNotFoundError:
        # First station: start a new table.
        locations_all = locations
    else:
        # Re-running for the same platform replaces its row instead of
        # appending a duplicate. Only possible when the stored table has a
        # "station" column - its layout is not visible from here.
        if "station" in existing.columns:
            existing = existing[existing["station"] != platform_name]
        locations_all = pd.concat([existing,locations])
    locations_all.to_csv("../observations/stations.csv")

    return all_wl_rollmean,all_wl

List of remaining platforms:
- [ ] DB
- [ ] MO
- [ ] IJmondstroompaal2 (GOOD)

# DB

In [8]:
# The modelskill package can be used to compare model results with observations.
# For more info on modelskill, see https://github.com/DHI/modelskill
# Folder holding the raw CMEMS netCDF files (relative to this notebook)
obs_fldr = "raw_data/CMEMS/"
# Collect the observation file paths in a list (a single file for platform DB)
obs_dir = [f"{obs_fldr}GL_TS_DB_6301615.nc"]

# Open the file and list its variables to see what the platform provides.
# NOTE: `ds` stays open and is reused by the following cells.
ds = nc.Dataset(obs_dir[0])
ds.variables.keys()

dict_keys(['TIME', 'TIME_QC', 'LATITUDE', 'LONGITUDE', 'POSITION_QC', 'DC_REFERENCE', 'TRAJECTORY', 'DEPH', 'TEMP', 'TEMP_QC', 'EWCT', 'EWCT_QC', 'NSCT', 'NSCT_QC'])

In [9]:
# u current: show the metadata (units, shape, fill value) of the EWCT variable
ds["EWCT"]

<class 'netCDF4.Variable'>
float32 EWCT(TIME, DEPTH)
    _FillValue: 9.96921e+36
    long_name: West-east current component
    standard_name: eastward_sea_water_velocity
    units: m s-1
    coordinates: TIME LATITUDE LONGITUDE DEPH TRAJECTORY
    data_mode: R
    ancillary_variables: EWCT_QC
unlimited dimensions: 
current shape = (77920, 2)
filling on

In [10]:
# Show the metadata (units, shape, fill value) of the NSCT variable
ds["NSCT"]

<class 'netCDF4.Variable'>
float32 NSCT(TIME, DEPTH)
    _FillValue: 9.96921e+36
    long_name: South-north current component
    standard_name: northward_sea_water_velocity
    units: m s-1
    coordinates: TIME LATITUDE LONGITUDE DEPH TRAJECTORY
    data_mode: R
    ancillary_variables: NSCT_QC
unlimited dimensions: 
current shape = (77920, 2)
filling on

# MO

In [11]:

# Folder holding the raw CMEMS netCDF files (relative to this notebook)
obs_fldr = "raw_data/CMEMS/"
# Collect the observation file paths in a list (a single file for platform MO)
obs_dir = [f"{obs_fldr}NO_TS_MO_6201065.nc"]

# Open the file and list its variables to see what the platform provides.
# NOTE: this rebinds `ds`; the dataset opened in the earlier cell is not closed.
ds = nc.Dataset(obs_dir[0])
ds.variables.keys()

dict_keys(['TIME', 'TIME_QC', 'DEPH', 'LATITUDE', 'LONGITUDE', 'STATION', 'OSAT', 'OSAT_QC', 'OSAT_DM', 'TEMP', 'TEMP_QC', 'TEMP_DM', 'HCSP', 'HCSP_QC', 'HCDT', 'HCDT_QC', 'PSAL', 'PSAL_QC', 'PSAL_DM'])

In [12]:
# Show the metadata (units, valid range, shape) of the HCSP variable
ds["HCSP"]

<class 'netCDF4.Variable'>
float32 HCSP(TIME, DEPTH)
    standard_name: sea_water_speed
    units: m s-1
    _FillValue: 9.96921e+36
    long_name: Horizontal current speed
    valid_min: 0.001
    valid_max: 1.5
    coordinates: TIME LATITUDE LONGITUDE DEPH STATION
    ancillary_variables: HCSP_QC
    data_mode: R
unlimited dimensions: 
current shape = (1193772, 30)
filling off

In [13]:
# Show the metadata (units, valid range, shape) of the HCDT variable
ds["HCDT"]

<class 'netCDF4.Variable'>
float32 HCDT(TIME, DEPTH)
    standard_name: direction_of_sea_water_velocity
    units: degree
    _FillValue: 9.96921e+36
    long_name: Current to direction relative true north
    valid_min: 0.0
    valid_max: 360.0
    coordinates: TIME LATITUDE LONGITUDE DEPH STATION
    ancillary_variables: HCDT_QC
    data_mode: R
unlimited dimensions: 
current shape = (1193772, 30)
filling off

# IJmondstroompaal2

In [14]:
# Folder holding the raw CMEMS netCDF files (relative to this notebook)
obs_fldr = "raw_data/CMEMS/"
# Collect the observation file paths in a list (a single file for IJmondstroompaal2)
obs_dir = [f"{obs_fldr}NO_TS_MO_IJmondstroompaal2.nc"]

# Open the file and list its variables to see what the platform provides.
# NOTE: this rebinds `ds`; the dataset opened in the earlier cell is not closed.
ds = nc.Dataset(obs_dir[0])
ds.variables.keys()

dict_keys(['TIME', 'TIME_QC', 'DEPH', 'LATITUDE', 'LONGITUDE', 'STATION', 'HCSP', 'HCSP_QC', 'HCDT', 'HCDT_QC'])