# Compute a NetCDF database for OG1 glider data

In [1]:
import sys
sys.path.append('C:/Users/flapet/OneDrive - NOC/Documents/utils_python')
from functions.profiling import *
from functions.general import *
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Work from the db_building folder so the relative data paths below resolve
project_dir = "C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building/"
os.chdir(project_dir)

# Folder with the OG1 glider files, relative to the working directory set above
dataset_dir = "data/glider/OG1/"

In [2]:
# Path of the glider file inside the dataset folder
filename = f"{dataset_dir}Cabot_645_R.nc"
# Profile detection helper (star-imported); `prof` is consumed below as a
# sequence holding one array of measurement indices per profile
prof = find_profiles_by_depth(filename)
# Open the same file as an xarray dataset
dat = xr.open_dataset(filename)

100%|██████████| 1090/1090 [00:00<00:00, 1211.84it/s]


In [7]:
# Number of measurements in each detected profile
row_lengths = [len(arr) for arr in prof]
# 1-based profile number repeated once per measurement of that profile.
# np.repeat returns a flat integer array directly. Stacking per-profile arrays
# into an object array instead silently yields an object-dtype result whenever
# every profile has the same length (or there is a single profile).
profile_list = np.repeat(
    np.arange(1, len(row_lengths) + 1),
    np.asarray(row_lengths, dtype=int),
)

# Flatten the per-profile measurement indices and keep only the dataset points
# that belong to a profile
prof_flat = np.concatenate(prof, axis = None)

nc_profiles = dat.sel(N_MEASUREMENTS = prof_flat)

In [8]:
# Attach the per-measurement profile number as a new variable along N_MEASUREMENTS
profile_index_var = ("N_MEASUREMENTS", profile_list)
nc_profiles = nc_profiles.assign({"profile_index": profile_index_var})

In [9]:
# Metadata describing the profile_index variable
profile_index_attrs = dict(
    long_name='Index of the profile',
    description='Each obs has an index that correspond to the number of the profile',
    source='Profiling function from utils python',
    processing_date='2024-11-27',
)
nc_profiles['profile_index'].attrs.update(profile_index_attrs)

In [10]:
# Dask is used here for the scheduler setting and the progress bar
import dask
from dask.diagnostics import ProgressBar

# Steps 1 and 2: put the whole N_MEASUREMENTS dimension in one chunk (-1), then
# keep only the variables needed downstream
variables_to_keep = ["CHLA", "PRES", "profile_index"]
nc_profiles = nc_profiles.chunk({"N_MEASUREMENTS": -1})[variables_to_keep]

# Step 3: select the threaded Dask scheduler for the computation below
dask.config.set(scheduler="threads")

# Materialise the reduced dataset, showing a progress bar while it runs
with ProgressBar():
    nc_profiles = nc_profiles.compute()

[########################################] | 100% Completed | 450.01 s


In [11]:
# Display the PRES variable as a NumPy array (quick visual check of the loaded data)
nc_profiles['PRES'].values

array([  nan,   nan, -0.03, ...,   nan,   nan,  4.11], dtype=float32)

In [12]:
# Group the measurements by profile number so single profiles can be looked up by label
group_key = "profile_index"
nc_profiles_grouped = nc_profiles.groupby(group_key)

In [13]:
# Display the group whose profile_index label is 165
nc_profiles_grouped[165]

In [None]:
import matplotlib.pyplot as plt

# Look the group up once instead of indexing the groupby object twice
profile_165 = nc_profiles_grouped[165]

# interp_nan is a star-imported helper; presumably it interpolates over NaN
# gaps - TODO confirm against the utils module
chl = interp_nan(profile_165['CHLA'].values)
pres = interp_nan(profile_165['PRES'].values)

# Simple plot of CHLA against negated PRES; labelled so the figure stands alone
fig, ax = plt.subplots()
ax.plot(chl, -pres)
ax.set_xlabel("CHLA")
ax.set_ylabel("-PRES")
ax.set_title("Profile 165")
plt.show()

Basic operation #1 - looping through the profiles to interpolate and smooth the CHLA profiles.

In [17]:
import math

In [21]:
# Scratch check: CHLA values of the measurements labelled as profile 165
is_profile_165 = nc_profiles["profile_index"] == 165
chla_t = nc_profiles["CHLA"].where(is_profile_165, drop=True).values

In [None]:
%%time

chla_smoothed = []
pres_interp = []

for prof in range(max(profile_list)):
    chla_t = nc_profiles["CHLA"].where(nc_profiles["profile_index"] == prof, drop=True).values
    pres_t = nc_profiles["PRES"].where(nc_profiles["profile_index"] == prof, drop=True).values

    if any(not math.isnan(x) for x in chla_t):
        chla_t = interp_nan(chla_t)
        pres_t = interp_nan(pres_t)

        chla_t = slide(chla_t, k = 2)

    chla_smoothed.append(chla_t)
    pres_interp.append(pres_interp)

chla_smoothed = np.concatenate(chla_smoothed, axis = None)
pres_interp = np.concatenate(pres_interp, axis = None)

nc_profiles = nc_profiles.assign(chla_adj=("N_MEASUREMENTS", chla_smoothed))
nc_profiles = nc_profiles.assign(pres_adj=("N_MEASUREMENTS", pres_interp))

In [None]:
# dat2 = dat.assign(CHLA_ADJUSTED=lambda x: x.CHLA / 2)
# dat2['CHLA_ADJUSTED'].attrs.update({
#     'units': 'mg',
#     'description': 'Chlorophyll adjusted by dividing original CHLA by 2',
#     'source': 'ECO Puk sensor',
#     'notes': 'Data adjusted for experimental purposes',
#     'processing_date': '2024-11-21'
# })