In [1]:
import xarray as xr
import pandas as pd
import os
import numpy as np
import pyarrow.parquet as pq
import polars as pl

In [2]:
# Exploratory open of a single daily PAR file (date token 20240101 in the filename) to inspect its structure.
# `test` is not referenced by any later cell visible in this notebook.
test = xr.open_dataset("C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/satellite/par_mapped/AQUA_MODIS.20240101.L3m.DAY.PAR.x_par.nc")

Open the daily PAR NetCDF files and store each day's `par` variable in a dictionary keyed by date (YYYYMMDD)

In [3]:
# Folder that holds the daily PAR NetCDF files
ncdf_dir = "C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/satellite/par_mapped/"

# Names of every '.nc' file in that folder, in sorted order
ncdf_files = sorted(name for name in os.listdir(ncdf_dir) if name.endswith('.nc'))

# Build a {date token: 'par' DataArray} lookup, one entry per file.
# The date token is the second dot-separated field of the filename
# (e.g. '20240101' in 'AQUA_MODIS.20240101.L3m.DAY.PAR.x_par.nc').
daily_par_data = {}
for file in ncdf_files:
    ds = xr.open_dataset(os.path.join(ncdf_dir, file))
    date = file.split('.', 2)[1]
    daily_par_data.update({date: ds["par"]})

# Show the entry stored under the key '20240101' as a quick sanity check
print(daily_par_data["20240101"])


<xarray.DataArray 'par' (lat: 715, lon: 1468)> Size: 4MB
[1049620 values with dtype=float32]
Coordinates:
  * lat      (lat) float32 3kB 70.03 69.99 69.95 69.9 ... 40.4 40.35 40.31 40.27
  * lon      (lon) float32 6kB -66.07 -66.03 -65.99 ... -5.024 -4.983 -4.941
Attributes:
    long_name:      Photosynthetically Available Radiation, R. Frouin
    units:          mol m^-2 day^-1
    standard_name:  surface_downwelling_photosynthetic_photon_flux_in_air
    valid_min:      -32750
    valid_max:      32250
    reference:      Frouin, R., Ligner, D.W., and Gautier, C., 1989: A Simple...
    display_scale:  linear
    display_min:    0.0
    display_max:    76.2


Open the BGC-Argo parquet table, keep observations with JULD after 2024-01-01 and PRES ≤ 205, and match each observation to the nearest satellite PAR pixel

In [47]:
# Read the float table with pyarrow, then convert it to a pandas DataFrame for the steps below
argo_parquet_path = 'C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/argo_pq/biocarbon_floats_table.parquet'
argo = pq.read_table(argo_parquet_path)
obs_data = argo.to_pandas()

In [48]:
# Keep only observations recorded after 2024-01-01 with PRES of at most 205.
# Both conditions are combined into a single mask, and .copy() makes the result an
# independent frame: later cells add columns to obs_data, which would otherwise
# raise SettingWithCopyWarning because the frame was produced by boolean filtering.
recent_and_shallow = (obs_data['JULD'] > pd.to_datetime('2024-01-01')) & (obs_data['PRES'] <= 205)
obs_data = obs_data[recent_and_shallow].copy()

In [49]:
from scipy.spatial import cKDTree

# Function to find the nearest PAR value for each observation
def get_nearest_par(lat, lon, date, daily_par_data):
    """Return the PAR value of the grid cell nearest to one observation.

    Parameters
    ----------
    lat, lon : float
        Position of the observation, in the same coordinates as the ``lat`` / ``lon``
        dimensions of the PAR grids.
    date : str
        Key identifying the day in ``daily_par_data`` (the caller formats it as YYYYMMDD).
    daily_par_data : dict
        Mapping from date key to the PAR grid for that day; each grid must support
        ``.sel(lon=..., lat=..., method='nearest')``.

    Returns
    -------
    scalar
        The PAR value at the nearest grid cell, or NaN when no grid is available for
        ``date`` or when the position itself is NaN.
    """
    # A day without a satellite file yields a missing value rather than aborting the
    # whole row-wise apply with a KeyError.
    par_data = daily_par_data.get(date)
    if par_data is None:
        return np.nan

    # A NaN position cannot be matched to a grid cell.
    if np.isnan(lat) or np.isnan(lon):
        return np.nan

    # Nearest-neighbour lookup on the grid coordinates; flatten()[0] unwraps the
    # selected value to a scalar.
    nearest_cell = par_data.sel(lon=lon, lat=lat, method='nearest')
    return nearest_cell.values.flatten()[0]

# Look up the nearest satellite PAR value for every observation, one row at a time
def _satellite_par_for_row(row):
    """Return the nearest PAR value for one observation row (date key is JULD as YYYYMMDD)."""
    return get_nearest_par(
        row["LATITUDE"],
        row["LONGITUDE"],
        row["JULD"].strftime('%Y%m%d'),
        daily_par_data,
    )

obs_data["satellite_par"] = obs_data.apply(_satellite_par_for_row, axis=1)


Regularize data every meter

In [58]:
# Bin every observation to the nearest whole PRES value; the rounded value is stored
# as a new column on obs_data itself.
obs_data['PRES_rounded'] = obs_data['PRES'].round()

# One row per (float, timestamp, rounded pressure): all remaining columns are averaged
# within each bin, and the unrounded PRES column is dropped from the result.
profile_bin_keys = ['PLATFORM_NUMBER', 'JULD', 'PRES_rounded']
df_regularized = (
    obs_data
    .groupby(profile_bin_keys, as_index=False)
    .mean()
    .drop(columns=['PRES'])
)


In [59]:
# Split the JULD timestamp into its calendar components, one new column per component
for date_part in ('year', 'month', 'day'):
    df_regularized[date_part] = getattr(df_regularized['JULD'].dt, date_part)

Apply the primary production algorithm

In [60]:
# Convert particulate backscattering from 700 nm to 470 nm assuming a lambda^-1
# spectral dependence: bbp(470) = bbp(700) * (470/700)^-1 = bbp(700) / (470/700).
# The divisor previously read (470/400); 400 is not the wavelength of the source
# column (BBP700), so the ratio is rebuilt from the two wavelengths involved.
BBP_SOURCE_WAVELENGTH_NM = 700
BBP_TARGET_WAVELENGTH_NM = 470
df_regularized['BBP470'] = df_regularized['BBP700_ADJUSTED'] / (BBP_TARGET_WAVELENGTH_NM / BBP_SOURCE_WAVELENGTH_NM)

# Linear backscattering-to-carbon relationship.
# NOTE(review): slope 12128 and intercept 0.59 look like the Graff et al. (2015)
# Cphyto vs bbp(470) fit — confirm the reference and units.
df_regularized['carbon'] = 12128 * df_regularized['BBP470'] + 0.59

In [79]:
from cbpm_argo import cbpm_argo
import numpy as np
from scipy.interpolate import interp1d

# Vertical grid (one level per unit of PRES_rounded, 0 to 199) on which cbpm_argo is evaluated
depth_grid = np.arange(0, 200)


def _interpolate_to_grid(pres, values, grid):
    """Linearly interpolate one profile variable onto ``grid``.

    Only samples where both ``pres`` and ``values`` are finite are used, so a gap in one
    variable does not contaminate another. Grid levels outside the sampled range are NaN,
    and a profile with fewer than two usable samples yields an all-NaN array.
    """
    usable = ~np.isnan(pres) & ~np.isnan(values)
    if usable.sum() < 2:
        return np.full(grid.shape, np.nan)
    to_grid = interp1d(pres[usable], values[usable], bounds_error=False, fill_value=np.nan)
    return to_grid(grid)


dfs = []

# One profile = one float at one timestamp. Grouping on JULD alone would merge two
# floats that happen to share a timestamp; sort=False keeps the order of appearance.
for (platform_number, juld), profile_rows in df_regularized.groupby(['PLATFORM_NUMBER', 'JULD'], sort=False):
    # Keep only rows whose rounded pressure lies on the grid, so every row can be
    # matched to exactly one grid level when results are written back.
    on_grid = profile_rows['PRES_rounded'].between(depth_grid[0], depth_grid[-1])
    temp_df = profile_rows[on_grid].copy()
    if temp_df.empty:
        continue

    pres_values = temp_df['PRES_rounded'].to_numpy(dtype=float)

    # Chlorophyll and carbon are interpolated independently, each with its own NaN mask.
    # NOTE(review): grid levels above the shallowest / below the deepest valid sample are
    # NaN; confirm how cbpm_argo treats NaN inputs near the surface.
    chl_z = _interpolate_to_grid(pres_values, temp_df['CHLA_ADJUSTED'].to_numpy(dtype=float), depth_grid)
    Cphyto_z = _interpolate_to_grid(pres_values, temp_df['carbon'].to_numpy(dtype=float), depth_grid)

    # Profile-level scalars: mean surface PAR and latitude, calendar date from the timestamp
    irr = temp_df['satellite_par'].mean()
    lat = temp_df['LATITUDE'].mean()

    # par_z is returned by cbpm_argo but, as before, not stored in the output frame
    pp_z, mu_z, par_z, prcnt_z, nutTempFunc_z, IgFunc_z = cbpm_argo(
        chl_z, Cphyto_z, irr, juld.year, juld.month, juld.day, lat
    )

    # Write results back by depth, not by row position: row k of the profile is its
    # k-th sampled depth, which is generally not grid level k.
    # Assumes each returned array has one value per depth_grid level — TODO confirm.
    depth_index = pres_values.astype(int)
    temp_df['pp'] = np.asarray(pp_z)[depth_index]
    temp_df['mu'] = np.asarray(mu_z)[depth_index]
    temp_df['prcnt'] = np.asarray(prcnt_z)[depth_index]
    temp_df['nutTempFunc'] = np.asarray(nutTempFunc_z)[depth_index]
    temp_df['IgFunc'] = np.asarray(IgFunc_z)[depth_index]

    dfs.append(temp_df)


# Combine all profiles; pd.concat raises on an empty list, so fall back to an empty frame
final_df = pd.concat(dfs) if dfs else df_regularized.iloc[0:0].copy()


  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n] * pow(chl_z[z-1],e[n]);   # after Morel and Maritorena (2001)
  kbio = X[n

In [80]:
# Save the production estimates to CSV. to_csv is called with its defaults, so the
# DataFrame index is written as the first (unnamed) column of the file.
final_df.to_csv('C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/argo_pp_estimations.csv')