# Info

**Project**

Assessment of gap-filling techniques applied to satellite phytoplankton composition products for the Atlantic Ocean

**Credit**

**© Ehsan Mehdipour**, 2025. (ehsan.mehdipour@awi.de)

Alfred Wegener Institute for Polar and Marine Research, Bremerhaven, Germany

This work is licensed under the **GNU General Public License v3.0 (GPL-3.0)**.

**Objective**

This code computes the degree of smoothing (DoS) from the difference between the gap-filled dataset and the original satellite data, using all data points present in the original satellite data. The degree of smoothing (DoS) is not an independent validation.

**Dataset**

The dataset is accessible through Copernicus Marine Service with the following DOI:
https://doi.org/10.48670/moi-00280 and Dataset ID: cmems_obs-oc_glo_bgc-plankton_my_l3-multi-4km_P1D

# Setup and configuration

## Import modules

In [1]:
# Modules for data analysis
import os
os.environ["OMP_NUM_THREADS"] = "1"
import xarray as xr
import numpy as np
import pandas as pd
import dask

# Modules for data visualisation
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, LinearSegmentedColormap
import matplotlib.dates as mdates
from matplotlib import gridspec
import cartopy.crs as ccrs

# Miscellaneous modules
from tqdm import tqdm

# Manual modules or parameters
from function import *
from params import *

# Turn off warning
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 
warnings.simplefilter("ignore", UserWarning)

## Dask setup for heavy computation and parallelization

In [2]:
## Creating SLURM Cluster
# cluster, client = dask_slurm_cluster(queue='smp', cores=16, scale=16)

## Creating Distributed Cluster
# client = dask_distributed_client(n_workers=8, threads_per_worker=None)

## Close the cluster after computation completed
# cluster.close()
# client.close()

# Only expedition duration (merged data)

In [2]:
# Load the merged (expedition-duration) datasets fully into memory.
# xr.load_dataset reads the data and closes the underlying file, whereas
# open_dataset(...).compute() leaves the NetCDF file handle open.
ds_input = xr.load_dataset(
    os.path.join(params['output_dir'], 'merged/ds_input.nc'))
# The original satellite data are log10-transformed before being compared
# with the reconstructions.
# NOTE(review): this assumes ds_dincae / ds_dineof are already stored in
# log10 space -- confirm against the code that writes the merged files.
ds_input = np.log10(ds_input)

ds_dincae = xr.load_dataset(
    os.path.join(params['output_dir'], 'merged/ds_dincae.nc'))

ds_dineof = xr.load_dataset(
    os.path.join(params['output_dir'], 'merged/ds_dineof.nc'))

In [3]:
# Difference between each gap-filled reconstruction and the (log10) original
# satellite data. Pixels that are NaN in either operand stay NaN and are
# skipped by the later mean.
ds_diff_dincae = ds_dincae - ds_input
ds_diff_dineof = ds_dineof - ds_input

In [4]:
def _rmse_table(ds_diff):
    """Return a one-column DataFrame of root-mean-square differences.

    One row per data variable of ``ds_diff`` plus a 'Total' row obtained by
    pooling all variables together along a temporary 'PFT' dimension.
    """
    squared = ds_diff**2
    table = np.sqrt(squared.mean())

    # Pooled value over every variable at once (not the mean of the rows)
    pooled_squared = ds_diff.to_array('PFT')**2
    table['Total'] = np.sqrt(pooled_squared.mean())

    return pd.DataFrame(table.to_pandas())


rmse_dincae = _rmse_table(ds_diff_dincae)
rmse_dineof = _rmse_table(ds_diff_dineof)

In [7]:
# Display both tables: DINCAE first, DINEOF second (one row per variable plus 'Total')
rmse_dincae,rmse_dineof

(               0
 CHL     0.035001
 DIATO   0.064440
 DINO    0.047174
 HAPTO   0.059187
 GREEN   0.055844
 PROKAR  0.053813
 Total   0.050136,
                0
 CHL     0.086693
 DIATO   0.119066
 DINO    0.091444
 HAPTO   0.115081
 GREEN   0.115276
 PROKAR  0.089267
 Total   0.100411)

# All time data (not merged data)

In [2]:
# CSV file detailing the boundaries of the regions of interest (ROI);
# the first CSV column is used as the DataFrame index.
# NOTE(review): `regions` is not referenced again in the visible cells.
regions = pd.read_csv('data/regions.csv', index_col=0)

In [3]:


# Per-region accumulators: for every region we keep the sum of squared
# differences and the number of valid (non-NaN) squared differences, so the
# pooled DoS can later be computed as sqrt(total_sum / total_count).
# Datasets opened with chunks are dask-backed, so nothing is evaluated here.
dincae_error_sum = []
dineof_error_sum = []
dincae_error_count = []
dineof_error_count = []

# Regions are stored in sub-directories named 1 .. 10 of params['output_dir']
for region in tqdm(range(1,11)):
    
    ## Read development data for coordinate information
    ds_dev = xr.open_dataset(os.path.join(params['output_dir'], str(region), f'ds_pft_dev.nc'),chunks='auto')
    ds_dev = ds_dev[params['PFT']]
    # Coordinates are captured BEFORE rm_boundry is applied; they are reused
    # below to label the reconstructions.
    time = ds_dev.time
    lat = ds_dev.lat
    lon = ds_dev.lon
    # ds_train = ds_train.sel(time=cloud_date_train.clouded_date.values).compute()
    # rm_boundry comes from the project's `function` module; presumably it
    # trims the boundary pixels of the region -- TODO confirm.
    ds_dev = rm_boundry(ds_dev) 
    
    ## Read test data or full data for DoS computation
    # NOTE(review): ds_test is opened and trimmed but never used below; the
    # DoS is computed against ds_dev. Confirm which reference is intended.
    ds_test = xr.open_dataset(os.path.join(params['output_dir'], str(region), f'ds_pft.nc'), chunks='auto')
    ds_test = ds_test[params['PFT']]
    ds_test = rm_boundry(ds_test)
    
    ## Read DINCAE data
    experiment='final'
    ds_dincae = xr.open_dataset(os.path.join(params['output_dir'], str(region), 'DINCAE', str(experiment), 'ds_reconstructed.nc'), chunks='auto')
    # Only the time coordinate is replaced with the one from ds_dev; lat/lon
    # are kept as stored in the DINCAE file.
    ds_dincae = ds_dincae.assign_coords({'time':time})
    ds_dincae = ds_dincae[params['PFT']]
    # ds_dincae = ds_dincae.sel(time=cloud_date_train.clouded_date.values).compute()
    ds_dincae = rm_boundry(ds_dincae)
    
    ## Read DINEOF data (one file per match of 'ds_reconstructed_*.nc')
    experiment='final_6'
    ds_dineof = xr.open_mfdataset(os.path.join(params['output_dir'], str(region), 'DINEOF', str(experiment), 'ds_reconstructed_*.nc'))
    # The DINEOF output uses generic dimension names; rename them and attach
    # the coordinates taken from ds_dev so that the subtraction below aligns.
    ds_dineof = ds_dineof.rename(
            {'dim003':'time','dim002':'lat','dim001':'lon'}
        ).assign_coords(
            {'time':time, 'lat':lat,'lon':lon}
        )

    ds_dineof = ds_dineof[params['PFT']]
    # ds_dineof = ds_dineof.sel(time=cloud_date_train.clouded_date.values).compute()
    ds_dineof = rm_boundry(ds_dineof)
    #--------------------------------------------------------------------------
    
    ## Remove outliers
    # Keep only finite values strictly inside (-3, 2); everything else becomes NaN.
    # NOTE(review): the bounds look like log10 concentration limits -- confirm.
    ds_dev = ds_dev.where((ds_dev<2)&(ds_dev>-3)&(np.isfinite(ds_dev)))
    ds_dincae = ds_dincae.where((ds_dincae<2)&(ds_dincae>-3)&(np.isfinite(ds_dincae)))
    ds_dineof = ds_dineof.where((ds_dineof<2)&(ds_dineof>-3)&(np.isfinite(ds_dineof)))

#     ds_dev = np.power(10,ds_dev)
#     ds_dincae = np.power(10,ds_dincae)
#     ds_dineof = np.power(10,ds_dineof)

    ## Compute DoS for DINCAE
    # Squared difference is NaN wherever either operand is NaN, so count()
    # and sum(skipna=True) both consider only pixels valid in both datasets.
    ds_dincae_error_2 = np.square(ds_dincae - ds_dev)
    ds_dincae_error_count = ds_dincae_error_2.count()
    ds_dincae_error_sum = ds_dincae_error_2.sum(skipna=True)
    
    dincae_error_sum.append(ds_dincae_error_sum)
    dincae_error_count.append(ds_dincae_error_count)
    
    ## Compute DoS for DINEOF (same procedure as for DINCAE)
    ds_dineof_error_2 = np.square(ds_dineof - ds_dev)
    ds_dineof_error_count = ds_dineof_error_2.count()
    ds_dineof_error_sum = ds_dineof_error_2.sum(skipna=True)

    dineof_error_sum.append(ds_dineof_error_sum)
    dineof_error_count.append(ds_dineof_error_count)

100%|███████████████████████████████████████████| 10/10 [01:06<00:00,  6.66s/it]


In [4]:
## Stack the per-region results of each accumulator along a new 'region' dimension

(
    dincae_error_sum,
    dincae_error_count,
    dineof_error_sum,
    dineof_error_count,
) = (
    xr.concat(per_region, dim='region')
    for per_region in (
        dincae_error_sum,
        dincae_error_count,
        dineof_error_sum,
        dineof_error_count,
    )
)

In [6]:
## Evaluate the four lazy accumulators in a single dask.compute call
## (runs on whichever Dask scheduler is active, e.g. a cluster client if one was created)

results = dask.compute(
    dincae_error_sum, dincae_error_count, dineof_error_sum, dineof_error_count
)

(
    dincae_error_sum,
    dincae_error_count,
    dineof_error_sum,
    dineof_error_count,
) = results

## Report values

In [15]:
## Degree of Smoothing (DoS) for DINCAE:
## square root of (squared error pooled over regions / valid-pixel count pooled over regions)

dincae_sq_err_total = dincae_error_sum.sum()
dincae_n_valid = dincae_error_count.sum()
DoS_DINCAE = np.sqrt(dincae_sq_err_total / dincae_n_valid)
DoS_DINCAE.round(2)

In [16]:
## Degree of Smoothing (DoS) for DINEOF:
## square root of (squared error pooled over regions / valid-pixel count pooled over regions)

dineof_sq_err_total = dineof_error_sum.sum()
dineof_n_valid = dineof_error_count.sum()
DoS_DINEOF = np.sqrt(dineof_sq_err_total / dineof_n_valid)
DoS_DINEOF.round(2)

In [18]:
## Ratio of the DINEOF DoS to the DINCAE DoS, rounded to two decimals
## (a value above 1 means DINEOF differs more from the original data than DINCAE)

(DoS_DINEOF / DoS_DINCAE).round(2)