In [None]:
import xarray as xr
import pandas as pd
import os
import numpy as np
import pyarrow.parquet as pq
import polars as pl

Open the daily PAR NetCDF files into a dictionary of xarray DataArrays, keyed by date (YYYYMMDD)

In [None]:
# Folder containing the mapped PAR NetCDF files (one file per day)
ncdf_dir = "C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/satellite/par_mapped/"

# Every '.nc' file name found in that folder, in sorted order
ncdf_files = sorted(name for name in os.listdir(ncdf_dir) if name.endswith('.nc'))

# Mapping: date token taken from the file name -> "par" variable of that file
daily_par_data = {}
for file in ncdf_files:
    # The second dot-separated field of the file name is used as the key.
    # Keys are later looked up with strftime('%Y%m%d'), so that field is
    # expected to be a YYYYMMDD date; adjust the index if the naming differs.
    date = file.split('.')[1]

    # The dataset is opened and left open; only its "par" variable is kept
    # (replace 'par' if the variable has another name in your files).
    ds = xr.open_dataset(os.path.join(ncdf_dir, file))
    daily_par_data[date] = ds["par"]

# Quick look at a single day: the PAR field stored under 1 January 2024
print(daily_par_data["20240101"])


Open BGC Argo parquet and summarise to one line per profile

In [None]:
# Read the BGC-Argo float table from its parquet file into a pyarrow Table
argo = pq.read_table('C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/argo_pq/biocarbon_floats_table.parquet')
# Convert the pyarrow Table to a pandas DataFrame used by all following cells
obs_data = argo.to_pandas()

In [None]:
# Keep only the rows dated before 1 January 2025 whose PRES value is below 201.
# Both conditions are applied in a single boolean mask.
obs_data = obs_data[
    (obs_data['JULD'] < pd.to_datetime("2025-01-01")) & (obs_data['PRES'] < 201)
]

In [None]:
def mad_despike(series, threshold=3.5):
    """Drop spikes from ``series`` using the median absolute deviation (MAD).

    A modified z-score, ``0.6745 * (value - median) / MAD``, is computed for
    every element (NaNs are ignored when taking the medians). Only elements
    whose absolute score is strictly below ``threshold`` are returned, so the
    result can be shorter than the input; NaN elements never pass the test.

    When the MAD is zero every score is set to zero and the input is returned
    unfiltered.
    """
    center = np.nanmedian(series)
    spread = np.nanmedian(np.abs(series - center))

    if spread:
        scores = 0.6745 * (series - center) / spread
    else:
        # Zero MAD: scoring is undefined, so nothing is treated as a spike
        scores = np.zeros_like(series)

    keep = np.abs(scores) < threshold
    return series[keep]


In [None]:
def despike_group(group):
    """Return ``group`` with an added ``bbp700_cleaned`` column.

    The new column holds ``BBP700_ADJUSTED`` after ``mad_despike``. Because
    ``mad_despike`` returns only the retained elements, index alignment leaves
    NaN in the rows that were rejected as spikes.

    The input frame is not modified: a new DataFrame is returned. pandas does
    not support functions that mutate the object handed to ``groupby.apply``,
    which is how this function is used.
    """
    return group.assign(bbp700_cleaned=mad_despike(group["BBP700_ADJUSTED"]))



In [None]:
# Despike BBP700_ADJUSTED separately for each set of rows sharing the same JULD
# (despike_group adds the 'bbp700_cleaned' column). group_keys=False keeps the
# group key out of the index of the combined result.
obs_data = obs_data.groupby(["JULD"], group_keys=False).apply(despike_group)

In [None]:
from scipy.spatial import cKDTree

def get_nearest_par(lat, lon, date, daily_par_data):
    """Look up the PAR value closest to a position on a given day.

    ``daily_par_data`` maps a date key to a gridded PAR object exposing
    ``size`` and ``sel(lon=..., lat=..., method='nearest')``. The first
    element of the selected values is returned.

    Returns 0 when ``date`` is not a key of ``daily_par_data`` or when the
    stored field is empty.
    NOTE(review): a missing day is therefore indistinguishable from a true
    PAR of 0 downstream -- confirm this fallback is intended.
    """
    if date not in daily_par_data or daily_par_data[date].size <= 0:
        return 0

    nearest_cell = daily_par_data[date].sel(lon=lon, lat=lat, method='nearest')
    return nearest_cell.values.flatten()[0]

# Step 1: one row per distinct (latitude, longitude, time) combination
unique_obs = obs_data.loc[:, ["LATITUDE", "LONGITUDE", "JULD"]].drop_duplicates()

# Step 2: look PAR up once per distinct combination; the JULD timestamp is
# formatted as YYYYMMDD to match the keys of daily_par_data
unique_obs = unique_obs.assign(
    satellite_par=lambda frame: frame.apply(
        lambda row: get_nearest_par(
            row["LATITUDE"], row["LONGITUDE"], row["JULD"].strftime('%Y%m%d'),
            daily_par_data
        ),
        axis=1
    )
)

# Step 3: attach the looked-up PAR to every observation row
obs_data = pd.merge(
    obs_data, unique_obs, how="left", on=["LATITUDE", "LONGITUDE", "JULD"]
)


Regularize data every meter

In [None]:
# Step 1: round PRES to zero decimals; the result is stored as a new column
# on obs_data itself
obs_data['PRES_rounded'] = obs_data['PRES'].round(0)

# Step 2: build df_regularized as obs_data without the raw PRES column, plus
# calendar fields derived from JULD. No grouping or averaging happens in this
# cell; rows are still one per original observation.
df_regularized = obs_data.drop(columns=['PRES']).assign(
    year=lambda frame: frame['JULD'].dt.year,
    month=lambda frame: frame['JULD'].dt.month,
    day=lambda frame: frame['JULD'].dt.day,
)

Apply the primary production algorithm

In [None]:
# Convert the despiked backscatter from the measured 700 nm channel to 470 nm.
# The scaling is a plain wavelength ratio, i.e. bbp(470) = bbp(700) / (470/700).
# The source column is BBP700, so the reference wavelength in the ratio must
# be 700 (the previous value of 400 matched neither wavelength and reduced
# bbp instead of increasing it towards the shorter wavelength).
# NOTE(review): this assumes a spectral slope of 1 -- confirm against the
# method description being followed.
df_regularized['BBP470'] = df_regularized['bbp700_cleaned'] / (470 / 700)

# Linear conversion from bbp(470) to the 'carbon' column used later as the
# phytoplankton-carbon input of cbpm_argo
df_regularized['carbon'] = 12128 * df_regularized['BBP470'] + 0.59

In [None]:
# Build one row for every (JULD, PRES_rounded) pair, whether or not it was
# observed, then average the observations that fall on each pair.
full_depth = (
    # Step 1: cross join of the distinct times with the distinct rounded
    # pressures, done through a constant helper column
    df_regularized[["JULD"]].drop_duplicates().assign(key=1)
    .merge(
        df_regularized[["PRES_rounded"]].drop_duplicates().assign(key=1),
        on="key"
    )
    .drop(columns="key")
    # Step 2: keep the levels with PRES_rounded below 201
    .loc[lambda grid: grid["PRES_rounded"] < 201]
    # Step 3: attach the observations; pairs without data get NaN columns
    .merge(df_regularized, on=["JULD", "PRES_rounded"], how="left")
    # Step 4: collapse to one row per pair -- NaN-ignoring mean for numeric
    # columns, first value of the group for every other column
    .groupby(["JULD", "PRES_rounded"], as_index=False)
    .agg(lambda x: np.nanmean(x) if np.issubdtype(x.dtype, np.number) else x.iloc[0])
)

In [None]:
def opp_befa(chl, irr, sst, dayL):
    """Compute the production estimate the notebook stores as ``npp_vgpm``.

    Parameters are scalars:
      chl  -- chlorophyll value (the caller passes the top of the profile)
      irr  -- irradiance value (the caller passes the satellite PAR)
      sst  -- temperature value (the caller passes a near-surface mean)
      dayL -- day length (the caller passes the output of ``cal_dayL``)

    Returns ``pb_opt * chl * dayL * irrFunc * z_eu`` where the three factors
    ``pb_opt``, ``irrFunc`` and ``z_eu`` are derived below.
    """
    # Chlorophyll-dependent intermediate; the power law switches at chl = 1.0
    chl_tot = 38.0 * np.power(chl, 0.425) if chl < 1.0 else 40.2 * np.power(chl, 0.507)

    # Depth factor: a second power law replaces the first when it gives <= 102
    z_eu = 200.0 * np.power(chl_tot, -0.293)
    if z_eu <= 102.0:
        z_eu = 568.2 * np.power(chl_tot, -0.746)

    # Temperature-dependent rate: constants outside [-1, 28.5], otherwise a
    # 7th-order polynomial in sst accumulated term by term from degree 1 up
    if sst < -10.0:
        pb_opt = 0.0
    elif sst < -1.0:
        pb_opt = 1.13
    elif sst > 28.5:
        pb_opt = 4.0
    else:
        pb_opt = 1.2956
        higher_order = (2.749e-1, 6.17e-2, -2.05e-2, 2.462e-3,
                        -1.348e-4, 3.4132e-6, -3.27e-8)
        for degree, coefficient in enumerate(higher_order, start=1):
            pb_opt = pb_opt + coefficient * np.power(sst, degree)

    # Saturating irradiance term
    irrFunc = 0.66125 * irr / (irr + 4.1)

    return pb_opt * chl * dayL * irrFunc * z_eu

def cal_dayL(lat, yDay):
    """Return the day length in hours for a latitude and a day of the year.

    ``lat`` is in degrees (it is converted to radians here) and ``yDay`` is
    the day number, scaled by 365 to an angle over the year. The result is
    24.0 when the sun never sets, 0 when it never rises (or when the inputs
    are NaN), and ``24 * arccos(r) / pi`` otherwise.
    """
    lat_rad = lat / 180.0 * np.pi
    year_angle = yDay / 365.0 * 2.0 * np.pi

    # Solar declination from a truncated Fourier series (degrees -> radians)
    declination_deg = (0.39637
                       - 22.9133 * np.cos(year_angle)
                       + 4.02543 * np.sin(year_angle)
                       - 0.38720 * np.cos(2 * year_angle)
                       + 0.05200 * np.sin(2 * year_angle))
    solar_dec = declination_deg * np.pi / 180.0

    r = -np.tan(lat_rad) * np.tan(solar_dec)

    # Ordinary case first: the sun both rises and sets
    if np.fabs(r) < 1:
        return 24.0 * np.arccos(r) / np.pi
    # Otherwise continuous daylight (r <= -1) or continuous night
    return 24.0 if r <= -1 else 0
    
def day_of_year(day, month, year=2024):
    """Return the 1-based day number of a calendar date within its year.

    ``year`` defaults to 2024, which is a leap year; pass the real year to get
    the correct day number for dates after February in other years.

    Raises ValueError (from ``datetime.date``) for an invalid date.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already imported the module.
    import datetime

    return (datetime.date(year, month, day) - datetime.date(year, 1, 1)).days + 1

In [None]:
def running_mean(array, window_size=5):
    """Smooth ``array`` with a centred moving average of ``window_size`` points.

    The output has the same length as the input (``mode='same'``). Near both
    ends the window extends past the data and the missing samples count as
    zero, so edge values are pulled towards zero. NaNs in the input propagate
    to every output point whose window contains them.
    """
    # Uniform weights that sum to one
    weights = np.full(window_size, 1.0) / window_size
    return np.convolve(array, weights, mode='same')

In [None]:
from cbpm_argo import cbpm_argo
import numpy as np
from scipy.interpolate import interp1d
import datetime

dfs = []
depth_grid = np.arange(0, 200)  # target grid: 0, 1, ..., 199


def _interp_to_grid(pressure, values, grid):
    """Linearly interpolate one profile variable onto ``grid``.

    Only levels where both ``pressure`` and ``values`` are non-NaN are used,
    so a gap in one variable cannot blank out its neighbours. Grid points
    outside the sampled range are NaN, and so is the whole result when fewer
    than two usable levels exist.
    """
    usable = ~np.isnan(pressure) & ~np.isnan(values)
    if np.count_nonzero(usable) < 2:
        return np.full(grid.shape, np.nan)
    interpolator = interp1d(pressure[usable], values[usable],
                            bounds_error=False, fill_value=np.nan)
    return interpolator(grid)


def _first_valid(values, n_top=5):
    """Return the first non-NaN entry among ``values[:n_top]``, or NaN if none."""
    top = values[:n_top]
    valid = top[~np.isnan(top)]
    return valid[0] if valid.size else np.nan


# One iteration per distinct JULD. groupby(sort=False) yields the profiles in
# order of first appearance without re-scanning the whole frame each time.
for juld, profile in full_depth.groupby('JULD', sort=False):
    # NOTE(review): iloc[1:201] discards the first row of every profile and
    # keeps at most the next 200. Confirm that skipping the first level is
    # intended rather than an off-by-one for iloc[0:200].
    temp_df = profile.iloc[1:201].copy()

    # Profile variables as float arrays for interpolation
    pres_values = temp_df['PRES_rounded'].to_numpy(dtype=float)
    chl_values = temp_df['CHLA_ADJUSTED'].to_numpy(dtype=float)
    carbon_values = temp_df['carbon'].to_numpy(dtype=float)
    temp_values = temp_df['TEMP'].to_numpy(dtype=float)

    # The 5-point running means that used to be computed here were never
    # passed on to the interpolation or the models, so they are no longer
    # calculated. Smooth the arrays above with running_mean() if wanted.

    # Report profiles whose chlorophyll cannot be interpolated (< 2 levels)
    if np.count_nonzero(~np.isnan(pres_values) & ~np.isnan(chl_values)) < 2:
        print(temp_df['JULD'].unique())

    # Each variable is interpolated with its own validity mask; all three
    # arrays are always defined, so no value can leak in from the previous
    # profile (temperature was previously left unset in the fallback case).
    interpolated_chl = _interp_to_grid(pres_values, chl_values, depth_grid)
    interpolated_carbon = _interp_to_grid(pres_values, carbon_values, depth_grid)
    interpolated_temp = _interp_to_grid(pres_values, temp_values, depth_grid)

    # Inputs of the production models
    chl_z = interpolated_chl
    Cphyto_z = interpolated_carbon
    irr = temp_df['satellite_par'].mean()  # Mean irradiance value
    lat = temp_df['LATITUDE'].mean()

    # Calendar fields come straight from the profile timestamp, which is the
    # same for every row of the profile
    stamp = pd.Timestamp(juld)
    year, month, day = stamp.year, stamp.month, stamp.day

    # Near-surface temperature: mean of the first five grid points, ignoring
    # points the interpolation could not fill (e.g. above the first sample)
    top_temp = interpolated_temp[0:5]
    sst = np.nanmean(top_temp) if not np.isnan(top_temp).all() else np.nan

    # Surface chlorophyll: grid point 0 when available, otherwise the first
    # filled grid point within the same five-point layer
    surface_chl = _first_valid(chl_z)

    # Day length for the actual year of the profile (leap years included)
    doy = day_of_year(day, month, year)
    day_length = cal_dayL(lat, doy)

    # Call the cbpm_argo function with interpolated data
    [pp_z, mu_z, par_z, prcnt_z, nutTempFunc_z, IgFunc_z, mzeu] = cbpm_argo(chl_z, Cphyto_z, irr, year, month, day, lat)

    # VGPM computation
    npp_vgpm = opp_befa(surface_chl, irr, sst, day_length)

    size_max = len(temp_df)

    # NOTE(review): the model outputs are written back by position, i.e. row k
    # of temp_df receives element k of each output array. This presumably
    # assumes the outputs follow depth_grid; because of the iloc[1:201] slice
    # above, row k may not sit at depth_grid[k]. Verify the alignment.
    temp_df.loc[:, 'pp'] = pp_z[0:size_max]
    temp_df.loc[:, 'mu'] = mu_z[0:size_max]
    temp_df.loc[:, 'prcnt'] = prcnt_z[0:size_max]
    temp_df.loc[:, 'nutTempFunc'] = nutTempFunc_z[0:size_max]
    temp_df.loc[:, 'IgFunc'] = IgFunc_z[0:size_max]
    temp_df.loc[:, 'zeu'] = np.full(size_max, mzeu)
    temp_df.loc[:, 'npp_vgpm'] = np.full(size_max, npp_vgpm)

    # Collect the per-profile result
    dfs.append(temp_df)


# Combine all DataFrames
final_df = pd.concat(dfs)


In [None]:
# Show the column names of the combined per-profile results
final_df.columns

In [None]:
# Write the combined results to CSV. No index option is passed, so the
# DataFrame index is written as the first column.
final_df.to_csv('C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/argo_pp_estimations_floats_2.csv')

In [None]:
# Extract every row of a single float for inspection.
# NOTE(review): the literal ends with a space; presumably PLATFORM_NUMBER
# values are space-padded in the source table -- confirm, otherwise this
# comparison selects nothing.
test = df_regularized[df_regularized["PLATFORM_NUMBER"] == "4903532 "]

In [None]:
# Save the rows of the selected float to CSV for closer inspection
test.to_csv('C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/data/problematic_float.csv')