# Info

**Project**

Assessment of gap-filling techniques applied to satellite phytoplankton composition products for the Atlantic Ocean

**Credit**

**© Ehsan Mehdipour**, 2025. (ehsan.mehdipour@awi.de)

Alfred Wegener Institute for Polar and Marine Research, Bremerhaven, Germany

This work is licensed under the **GNU General Public License v3.0 (GPL-3.0)**. 

**Objective**

This code reads the satellite products and the in-situ measurements and extracts the matchups between the two datasets using the criteria suggested by Bailey and Werdell (2006) and EUMETSAT (2022).

**Dataset**

The dataset is accessible through the Copernicus Marine Service with the following DOI:
https://doi.org/10.48670/moi-00280 and Dataset ID: cmems_obs-oc_glo_bgc-plankton_my_l3-multi-4km_P1D

# Setup and configuration

## Import modules

In [1]:
# Modules for data analysis
import os
os.environ["OMP_NUM_THREADS"] = "1"
import xarray as xr
import numpy as np
import pandas as pd
import dask

# Modules for data visualisation 
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, LinearSegmentedColormap
import matplotlib.dates as mdates
from matplotlib import gridspec
import cartopy.crs as ccrs

# Miscellaneous modules
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 
warnings.simplefilter("ignore", UserWarning)

# Manual modules or parameters
from function import *
from params import *

# Functions

In [None]:
## Loading HPLC dataset

def load_HPLC(fname, params):
    ''' 
    This function loads the High-performance liquid chromatography (HPLC) data from a
    tab-separated text file and returns it as a log10-transformed xarray Dataset.
    
    Parameters:
    fname (str): file name and directory of the HPLC file (tab-separated)
    params (dict): Dictionary of parameters. The keys read here are
        'PFT_HPLC_dict' (mapping used to rename the PFT columns to their abbreviations),
        'units' (unit attribute assigned to every variable) and
        'PFT_longname' (mapping from variable name to its long name)
    
    Returns:
    HPLC (xr.Dataset): HPLC dataset containing the log10 Chla concentrations of 
        PFTs and TChla for specific (PS113) expedition, indexed by "time" (truncated to
        days) with "lat" and "lon" as coordinates. Concentrations below 0.005 are raised
        to 0.005 before the log10 transform; missing measurements stay NaN.
    '''
    
    ## Read the file; the 'Date' and 'Time_UTC' columns are merged into a single
    ## datetime column which is used as the index
    ## NOTE(review): combining columns through a nested ``parse_dates`` list is deprecated
    ## in newer pandas releases — confirm against the pinned pandas version
    HPLC_df = pd.read_csv(fname, sep='\t', parse_dates=[['Date','Time_UTC']], index_col=0)
    ## Drop unnecessary columns
    HPLC_df = HPLC_df.drop(columns=['Proben ID','Event_label','jultime','depth','Cryptophytes','Chrysophytes'])
    ## Sort accordingly
    HPLC_df = HPLC_df.rename_axis('time').sort_index()
    ## rename the dimensions to make it similar with the other datasets
    HPLC_df = HPLC_df.rename(columns={'Lat':'lat','Lon':'lon'})
    ## rename the PFT name columns to the abbreviations
    HPLC_df = HPLC_df.rename(columns=params['PFT_HPLC_dict'])
    ## Compute the PROKAR as the sum of the (renamed) PROKAR and PROCHLO columns
    HPLC_df['PROKAR'] = HPLC_df['PROKAR'] + HPLC_df['PROCHLO']
    ## drop the prochlo column
    HPLC_df = HPLC_df.drop(columns=['PROCHLO'])
    ## convert to xarray dataset
    HPLC = HPLC_df.to_xarray().set_coords(['lat','lon'])
    ## assign unit and longname for plotting
    for var_name in HPLC.data_vars:
        HPLC[var_name] = HPLC[var_name].assign_attrs(
            units=params['units'],
            long_name=params['PFT_longname'][var_name],
        )
    ## convert the time to daily values for matchup process
    HPLC['time'] = HPLC.time.astype('datetime64[D]')
    ## limit the Chla concentrations to minimum of 0.005.
    ## The mask is written as "not below the floor" so that NaN (missing measurement)
    ## is kept as NaN instead of being replaced by the floor value.
    below_floor = HPLC < 0.005
    HPLC = HPLC.where(~below_floor, 0.005)
    ## convert to logarithmic scale
    HPLC = np.log10(HPLC)
    ## mask any remaining non-finite values
    HPLC = HPLC.where(np.isfinite(HPLC))
    return HPLC

In [None]:
def matchup(sat, ins):
    ''' 
    This function finds and extracts the satellite 3by3 matchups for each in-situ measurement 
    and performs the matchup extraction protocol. 
    Finally it gives the mean value of the 3by3 box as the final satellite matchup value.   
    
    Protocol applied to every in-situ sample:
      1. the satellite scene of the same time stamp is selected (samples whose time stamp
         does not exist in ``sat`` are skipped),
      2. the pixel nearest to the sample position and its neighbours (3by3 box, clipped at
         the grid edges) are extracted,
      3. pixels deviating more than 1.5 standard deviations from the box mean are masked,
      4. the box mean is kept only if at least 5 valid pixels remain and the coefficient
         of variation of the valid pixels is below 0.2; otherwise the matchup is NaN.
    
    Parameters:
    sat (xr.Dataset): Dataset containing daily satellite dataset with 
        "time, lat and lon" dimensions
    ins (xr.Dataset): Dataset containing in-situ measurements with "time" dimension and
        "lat and lon" coordinates
    
    Returns:
    sat_points (xr.Dataset): Dataset containing all the matchup values for satellite observations
    ins_points (xr.Dataset): Dataset containing all the matchup values for in-situ measurements
    
    Raises:
    ValueError: if none of the in-situ time stamps exists in the satellite dataset
    '''
    
    ## Create the list for the matchups
    sat_points_list = []
    ins_points_list = []

    ## Looping through the in-situ measurements
    for i in tqdm(range(ins.sizes['time'])):
        
        ## Extract the in-situ sample
        ins_point = ins.isel(time=i, drop=False)
        
        ## Extract the satellite scene of the same time stamp. An exact selection raises
        ## KeyError when the time stamp is missing, in which case the sample is skipped.
        try:
            sat_day = sat.sel(time=ins_point.time.values)
        except KeyError:
            continue
        
        ## Pixel nearest to the in-situ position
        sat_point = sat_day.sel(lat=ins_point.lat.values, lon=ins_point.lon.values, method='nearest')
        
        ## Get the index of the middle pixel
        lat_center = int((sat.lat == sat_point.lat).argmax().values)
        lon_center = int((sat.lon == sat_point.lon).argmax().values)
        
        ## Extract 3by3 values. The lower bound is clipped at 0 because a negative start
        ## index would wrap around and produce an empty window at the grid edge.
        s33 = sat_day.isel(lat=slice(max(lat_center - 1, 0), lat_center + 2),
                           lon=slice(max(lon_center - 1, 0), lon_center + 2))
        
        ## Pixels with deviations exceeding ±1.5 times the standard deviation are removed as outliers.
        ## The boolean mask is passed to ``where`` directly: a float mask would be evaluated
        ## by truthiness (NaN counts as True), which would keep the outliers.
        s33_std = s33.std(ddof=1)
        s33_mean = s33.mean()
        within_bounds = ((s33 >= (s33_mean - s33_std * 1.5)) &
                         (s33 <= (s33_mean + s33_std * 1.5)))
        s33 = s33.where(within_bounds)
        
        ## Compute the coefficient of variation (CV) for valid points;
        ## it is only defined when at least 5 valid pixels remain
        cv = abs(s33.std(ddof=1) / s33.mean()).where(s33.count() >= 5)
        s33_mean = s33.mean().where(cv < 0.2)
        
        ## Assign the middle pixel lat and lon to the matchups
        s33_mean = s33_mean.assign_coords({'lat': sat_point.lat})
        s33_mean = s33_mean.assign_coords({'lon': sat_point.lon})

        ## Append to the list
        ins_points_list.append(ins_point)
        sat_points_list.append(s33_mean)

    if not sat_points_list:
        raise ValueError('No in-situ time stamp was found in the satellite dataset; '
                         'no matchups could be extracted.')

    ## Concatenate all the values into xarray Dataset
    ins_points = xr.concat(ins_points_list, dim='time')
    sat_points = xr.concat(sat_points_list, dim='time')
    
    return sat_points, ins_points

In [None]:
def RMSE(sat_points, ins_points):
    '''
    Root-mean-squared-error (RMSE) of the satellite minus in-situ differences.
    
    The RMSE is computed separately for every variable of the datasets and once more
    over all variables together (stored under the name 'Total').
    
    Parameters:
    sat_points (xr.Dataset): Dataset containing all the matchup values for satellite observations
    ins_points (xr.Dataset): Dataset containing all the matchup values for in-situ measurements
    
    Returns:
    RMSE_df (pd.Dataframe): Dataframe indexed by 'PFT' with one 'RMSE' column, holding the
        RMSE of every variable and of 'Total'. The additional row 'N. Matchups' holds the
        number of valid (non-NaN) differences counted over all variables together.
    
    '''
    ## Differences between the two instruments, per variable and stacked over all variables
    diff = sat_points - ins_points
    stacked_diff = diff.to_array()
    
    ## RMSE of every variable, then of all variables pooled together
    rmse = np.sqrt(np.mean(np.square(diff)))
    rmse['Total'] = np.sqrt(np.mean(np.square(stacked_diff)))
    
    ## Tabulate and append the number of valid differences
    RMSE_df = rmse.to_array('PFT').rename('RMSE').to_dataframe()
    n_valid = stacked_diff.count().values.astype('int')
    RMSE_df.loc['N. Matchups'] = n_valid
    return RMSE_df

In [None]:
def validation(sat, insitu):
    '''
    Run the initial validation: extract the matchup points with ``matchup`` and compute
    the RMSE between satellite and in-situ measurements with ``RMSE``.
    
    Parameters:
    sat (xr.Dataset): Dataset containing all satellite observations of the study time and region
    insitu (xr.Dataset): Dataset containing all in-situ measurements of the study time and region
    
    Returns (in this order):
    RMSE_df (pd.Dataframe): Dataframe stating the RMSE for PFTs and TChla
    sat_points (xr.Dataset): Dataset containing all the matchup values for satellite observations
    ins_points (xr.Dataset): Dataset containing all the matchup values for in-situ measurements
    
    '''
    matched_sat, matched_ins = matchup(sat, insitu)
    return RMSE(matched_sat, matched_ins), matched_sat, matched_ins

# Call

## Load data

In [None]:
## In-situ measurement HPLC data, loaded through load_HPLC
## NOTE(review): 'path/to/HPLC/data' looks like a placeholder — point it to the actual HPLC file
HPLC = load_HPLC(fname='path/to/HPLC/data', params=params)

In [None]:
## Original satellite dataset
## load_dataset reads the whole file into memory and closes the file handle afterwards
ds_input = xr.load_dataset(os.path.join(params['output_dir'],'merged/ds_input.nc'))
## Transform the values to logarithmic (log10) scale
ds_input = np.log10(ds_input)

In [None]:
## DINCAE gap-filled satellite dataset
## load_dataset reads the whole file into memory and closes the file handle afterwards
## NOTE(review): no log10 transform is applied here, unlike ds_input — presumably the
## gap-filled file is already stored in log10 scale; confirm
ds_dincae = xr.load_dataset(os.path.join(params['output_dir'],'merged/ds_dincae.nc'))

In [None]:
## DINEOF gap-filled satellite dataset
## load_dataset reads the whole file into memory and closes the file handle afterwards
## NOTE(review): no log10 transform is applied here, unlike ds_input — presumably the
## gap-filled file is already stored in log10 scale; confirm
ds_dineof = xr.load_dataset(os.path.join(params['output_dir'],'merged/ds_dineof.nc'))

## Original satellite data matchups

In [None]:
## Names of the two instruments; they label the "instrument" dimension and serve as
## file-name extension when saving and later retrieving the matchups for
## statistical and visualisation purposes
instruments = ['Input','HPLC']

## Matchup extraction and initial RMSE for the original satellite dataset
RMSE_df_input, sat_points_input, ins_points_input = validation(sat=ds_input, insitu=HPLC)

## Stack satellite and in-situ matchups along a new "instrument" dimension
matchup_values_input = xr.concat(
    [sat_points_input, ins_points_input],
    dim='instrument',
).assign_coords(instrument=instruments)

## Saving the outputs (disabled)

# matchup_dir = os.path.join(params['output_dir'],'merged/matchups')
# RMSE_df_input.to_csv(os.path.join(matchup_dir, f'RMSE_{instruments[0]}_{instruments[1]}.csv'))
# matchup_values_input.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}.nc'))

## Filled and Transferred matchups DINCAE
**Transferred**: matchups that are also present in the original (non-gap-filled) satellite dataset

**Filled**: matchups that are missing in the original satellite dataset and exist only after gap-filling

In [None]:
## Extracting all the matchups for DINCAE

instruments = ['DINCAE','HPLC']

## Matchup extraction and initial RMSE for the DINCAE gap-filled dataset
RMSE_df_dincae, sat_points_dincae, ins_points_dincae = validation(sat=ds_dincae, insitu=HPLC)

## Stack satellite and in-situ matchups along a new "instrument" dimension
matchup_values_dincae = xr.concat(
    [sat_points_dincae, ins_points_dincae],
    dim='instrument',
).assign_coords(instrument=instruments)

## Saving matchups (disabled)
# matchup_dir = os.path.join(params['output_dir'],'merged/matchups')
# RMSE_df_dincae.to_csv(os.path.join(matchup_dir, f'RMSE_{instruments[0]}_{instruments[1]}.csv'))
# matchup_values_dincae.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}.nc'))

In [None]:
## Separating the DINCAE matchups into transferred and filled matchups

## Transferred: keep the DINCAE values only where the original satellite matchup is finite
sat_points_dincae_transferred = sat_points_dincae.where(np.isfinite(sat_points_input))
matchup_values_dincae_transferred = xr.concat(
    [sat_points_dincae_transferred, ins_points_dincae],
    dim='instrument',
).assign_coords(instrument=instruments)

## Filled: keep the DINCAE values only where the original satellite matchup is NaN
sat_points_dincae_filled = sat_points_dincae.where(np.isnan(sat_points_input))
matchup_values_dincae_filled = xr.concat(
    [sat_points_dincae_filled, ins_points_dincae],
    dim='instrument',
).assign_coords(instrument=instruments)

## Saving matchups (disabled)
# matchup_dir = os.path.join(params['output_dir'],'merged/matchups')
# matchup_values_dincae_transferred.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}_transferred.nc'))
# matchup_values_dincae_filled.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}_filled.nc'))

## Filled and Transferred matchups DINEOF
**Transferred**: matchups that are also present in the original (non-gap-filled) satellite dataset

**Filled**: matchups that are missing in the original satellite dataset and exist only after gap-filling

In [None]:
## Extracting all the matchups for DINEOF

instruments = ['DINEOF','HPLC']

## Matchup extraction and initial RMSE for the DINEOF gap-filled dataset
RMSE_df_dineof, sat_points_dineof, ins_points_dineof = validation(sat=ds_dineof, insitu=HPLC)

## Stack satellite and in-situ matchups along a new "instrument" dimension
matchup_values_dineof = xr.concat(
    [sat_points_dineof, ins_points_dineof],
    dim='instrument',
).assign_coords(instrument=instruments)

## Saving matchups (disabled)
# matchup_dir = os.path.join(params['output_dir'],'merged/matchups')
# RMSE_df_dineof.to_csv(os.path.join(matchup_dir, f'RMSE_{instruments[0]}_{instruments[1]}.csv'))
# matchup_values_dineof.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}.nc'))

In [None]:
## Separating and extracting matchups for DINEOF transferred and filled matchups

## Transferred: keep the DINEOF values only where the original satellite matchup is finite
sat_points_dineof_transferred = sat_points_dineof.where(np.isfinite(sat_points_input))
matchup_values_dineof_transferred = xr.concat((sat_points_dineof_transferred,ins_points_dineof), dim='instrument').assign_coords({'instrument':instruments})

## Filled: keep the DINEOF values only where the original satellite matchup is NaN
sat_points_dineof_filled = sat_points_dineof.where(np.isnan(sat_points_input))
matchup_values_dineof_filled = xr.concat((sat_points_dineof_filled,ins_points_dineof), dim='instrument').assign_coords({'instrument':instruments})

## Saving matchups
## The output directory is defined (and created if missing) in this cell because the
## save calls below are active and no other cell defines ``matchup_dir``.
matchup_dir = os.path.join(params['output_dir'],'merged/matchups')
os.makedirs(matchup_dir, exist_ok=True)
# RMSE_df_dineof.to_csv(os.path.join(matchup_dir, f'RMSE_{instruments[0]}_{instruments[1]}.csv'))
matchup_values_dineof_transferred.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}_transferred.nc'))
matchup_values_dineof_filled.to_netcdf(os.path.join(matchup_dir, f'matchups_{instruments[0]}_{instruments[1]}_filled.nc'))