# Making a BGC Argo Dataframe

In [1]:
import os
import pandas as pd
import xarray as xr
import sys
sys.path.append('C:/Users/flapet/OneDrive - NOC/Documents/utils_python')
from functions.float_download import *
import urllib3
import shutil
import requests

In [2]:
wmo_number = [6990636, 3901581, 1902695, 4903659, 7902223, 1902637]

In [3]:
root = 'C:/Users/flapet/OneDrive - NOC/Documents/IDAPro/lib/db_building'
profile_dir = root + '/data/argo_nc/'
natl_dir = profile_dir + 'North_Atlantic/'

# Create GO-BGC folders if they do not exist yet
if 'data' not in os.listdir(root):
  os.mkdir(root + '/data')
if 'argo_nc' not in os.listdir(root + '/data'):
  os.mkdir(profile_dir)

In [None]:
wmoids, gdac_index, downloaded_filenames \
                   = argo_gdac(floats=wmo_number, save_to=profile_dir, overwrite_index=True, overwrite_profiles=True)

## Take a look at floats position

In [4]:
import plotly.graph_objects as go

In [19]:
# Choose whether to plot temperature ('temp'), salinity ('psal'), chlorophyll ('chla'), backscatter ('bbp'),
#                        dissolved oxygen ('doxy'), nitrate ('NO3'), or pH ('pH') on map
which_param = 'temp'

# Set up Plotly canvas
fig = go.Figure(go.Scattergeo())

# Iterate through floats downloaded
for filename in [f for f in os.listdir(profile_dir) if os.path.isfile(os.path.join(profile_dir, f)) and f.endswith('.nc')]:

  # Load float data
  data = xr.open_dataset(profile_dir + filename)

  # Organize data for hover text
  hov = data[['CYCLE_NUMBER','JULD','LATITUDE','LONGITUDE']].to_pandas()

  # Calculate near-surface parameter values
  if which_param == 'temp':
    param_name = 'TEMP'; cmin = 15; cmax = 13; colorscale = 'Thermal'
    title = 'Temperature (°C)'; hover_str = 'Near-surface temperature: %{customdata[4]:.02f}°C'
  elif which_param == 'psal':
    param_name = 'PSAL_ADJUSTED'; cmin = 35.5; cmax = 37.5; colorscale = 'haline'
    title = 'Salinity (PSU)'; hover_str = 'Near-surface salinity: %{customdata[4]:.02f} PSU'
  elif which_param == 'chla':
    param_name = 'CHLA_ADJUSTED'; cmin = 0.0; cmax = 1.5; colorscale = 'algae'
    title = 'Chlorophyll-a (mg/m^3)'; hover_str = 'Near-surface chlorophyll-a: %{customdata[4]:.02f} mg/m^3'
  elif which_param == 'bbp':
    param_name = 'BBP700'; cmin = 0.0; cmax = 0.001; colorscale = 'matter'
    title = 'Particle backscattering at 700 nm (m^-1)'; hover_str = 'Near-surface backscattering: %{customdata[4]:.05f} m^-1'
  elif which_param == 'doxy':
    param_name = 'DOXY_ADJUSTED'; cmin = 190; cmax = 350; colorscale = 'ice'
    title = 'Dissolved oxygen (µmol/kg)'; hover_str = 'Near-surface dissolved oxygen: %{customdata[4]:.02f} µmol/kg'
  elif which_param == 'NO3':
    param_name = 'NITRATE_ADJUSTED'; cmin = 0.0; cmax = 1.5; colorscale = 'PuRd'
    title = 'Nitrate (µmol/kg)'; hover_str = 'Near-surface nitrate: %{customdata[4]:.02f} µmol/kg'
  elif which_param == 'pH':
    param_name = 'PH_IN_SITU_TOTAL_ADJUSTED'; cmin = 7.95; cmax = 8.10; colorscale = 'Sunset'
    title = 'pH'; hover_str = 'Near-surface pH: %{customdata[4]:.02f}'
  hov['NEAR_SURF'] = [prof[~np.isnan(prof)][0] if len(prof[~np.isnan(prof)]) > 1 else np.NaN for prof in data[param_name].values]

  # Add float trajectory to map
  fig.add_trace(go.Scattermapbox(mode = 'lines',
                                 lon = hov['LONGITUDE'],
                                 lat = hov['LATITUDE'],
                                 marker = go.scattermapbox.Marker(color='white',size=5),
                                 showlegend = False))
  fig.add_trace(go.Scattermapbox(mode = 'markers',
                                 lon = hov['LONGITUDE'],
                                 lat = hov['LATITUDE'],
                                 marker = go.scattermapbox.Marker(color=hov['NEAR_SURF'],size=10,cmin=cmin,cmax=cmax,
                                                                  colorscale=colorscale,
                                                                  colorbar={'title':title,'titleside':'right'},
                                                                  showscale=True),
                                 customdata = hov.values,
                                 hovertemplate = 'Profile #: %{customdata[0]}<br>Time: %{customdata[1]}<br>' + \
                                    'Latitude: %{customdata[2]:.02f}<br>Longitude: %{customdata[3]:.02f}<br>' + hover_str,
                                 name = 'Float: ' + data['PLATFORM_NUMBER'].values[0].decode('utf-8'),
                                 showlegend = False))

# Formatting and basemap
fig.update_layout(margin = {'l':0,'t':0,'b':0,'r':0},
                  height = 500,
                  width = 1000,
                  autosize = False,
                  hovermode = 'closest',
                  mapbox = {'style':'open-street-map',
                            'center':{'lon':-20,'lat':60},
                            'zoom':4.0})

## GroupBy demo

In [39]:
import polars as pl
import pyarrow.parquet as pq

In [73]:
display(data)

In [78]:
%%time

variables = ['PLATFORM_NUMBER', 'N_PROF', 'JULD', 'LONGITUDE', 'LATITUDE', 'PRES', 'TEMP', 'PSAL', 'CHLA_ADJUSTED', 'BBP700_ADJUSTED']

df_list = []

for filename in [f for f in os.listdir(profile_dir) if os.path.isfile(os.path.join(profile_dir, f)) and f.endswith('.nc')]:
    filepath = profile_dir + filename
    ds = xr.open_dataset(filepath)
    df = pl.DataFrame(ds[variables].to_dataframe())
    df = df.with_columns(
        pl.col("PLATFORM_NUMBER").cast(pl.String)) #Platform number is interpreted as binary, convert it into a string
    df_list.append(df)

argo_table = pl.concat(df_list)

argo_table.write_parquet(root + '/data/argo_pq/biocarbon_floats_table.parquet')

CPU times: total: 1.2 s
Wall time: 1.23 s


In [79]:
argo_table.head()

PLATFORM_NUMBER,JULD,LONGITUDE,LATITUDE,PRES,TEMP,PSAL,CHLA_ADJUSTED,BBP700_ADJUSTED
str,datetime[ns],f64,f64,f32,f32,f32,f32,f32
"""1902637 """,2023-06-19 21:24:08.000001536,-19.106998,51.041874,-0.01,17.5968,0.008,,
"""1902637 """,2023-06-19 21:24:08.000001536,-19.106998,51.041874,0.035,17.609264,2.40992,0.7446,0.004327
"""1902637 """,2023-06-19 21:24:08.000001536,-19.106998,51.041874,0.135,17.636965,7.74752,0.7446,0.002622
"""1902637 """,2023-06-19 21:24:08.000001536,-19.106998,51.041874,0.235,17.664665,13.085119,0.7446,0.002653
"""1902637 """,2023-06-19 21:24:08.000001536,-19.106998,51.041874,0.335,17.692366,18.42272,0.7446,0.002558


In [81]:
res = argo_table.group_by(['PLATFORM_NUMBER']).len()
print(res)

shape: (7, 2)
┌─────────────────┬────────┐
│ PLATFORM_NUMBER ┆ len    │
│ ---             ┆ ---    │
│ str             ┆ u32    │
╞═════════════════╪════════╡
│ 6990636         ┆ 61541  │
│ 1902637         ┆ 120160 │
│ 3901581         ┆ 128457 │
│ 1902695         ┆ 32614  │
│ 4903659         ┆ 91636  │
│ 7902223         ┆ 18088  │
│ 5904183         ┆ 129781 │
└─────────────────┴────────┘
