# Convert the hourly netCDF ragged array to parquet

In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import awkward as ak

In [None]:
# Location of the input netCDF file (relative path); it is the file opened
# with xarray further down in the notebook.
path_gdp = '../data/process/gdp_v2.00.nc'

## Awkward Array supports two-dimensional (ragged) data through nested structures, so all variables with dimensions *metadata[traj]* and *data[obs]* can be combined into a parquet file

In [None]:
# Open the netCDF file as an xarray Dataset. decode_times=False keeps the time
# variables as the values stored in the file rather than decoding them into
# datetime objects.
ds = xr.open_dataset(path_gdp, decode_times=False) 

In [None]:
# List offsets shared by every per-observation variable: a leading 0 followed
# by the running total of `rowsize`, so entry i .. i+1 delimits list i.
# NOTE(review): offsets are stored as 32-bit indices; presumably the total
# number of observations fits in int32 - confirm for larger datasets.
offset = ak.layout.Index32(np.insert(np.cumsum(ds.rowsize), 0, 0))


def _ragged(flat_values):
    """Wrap flat values in a 32-bit list-offset layout split at `offset`."""
    return ak.layout.ListOffsetArray32(offset, ak.layout.NumpyArray(flat_values))


# Position and time of each observation.
longitude = _ragged(ds.longitude)
latitude = _ragged(ds.latitude)
time = _ragged(ds.time.values)
ids = _ragged(ds.ids)

# Velocity components and gap.
ve = _ragged(ds.ve)
vn = _ragged(ds.vn)
gap = _ragged(ds.gap)

# Error variables for position and velocity.
err_lat = _ragged(ds.err_lat)
err_lon = _ragged(ds.err_lon)
err_ve = _ragged(ds.err_ve)
err_vn = _ragged(ds.err_vn)

drogue_status = _ragged(ds.drogue_status)

# SST variables together with their error and flag variables.
sst = _ragged(ds.sst)
sst1 = _ragged(ds.sst1)
sst2 = _ragged(ds.sst2)
err_sst = _ragged(ds.err_sst)
err_sst1 = _ragged(ds.err_sst1)
err_sst2 = _ragged(ds.err_sst2)
flg_sst = _ragged(ds.flg_sst)
flg_sst1 = _ragged(ds.flg_sst1)
flg_sst2 = _ragged(ds.flg_sst2)

In [None]:
# Field name -> list-offset layout, in the order the fields appear in the
# resulting record. Keeping name and layout side by side avoids the two
# parallel lists drifting out of sync.
obs_fields = {
    "longitude": longitude,
    "latitude": latitude,
    "time": time,
    "ids": ids,
    "ve": ve,
    "vn": vn,
    "gap": gap,
    "err_lat": err_lat,
    "err_lon": err_lon,
    "err_ve": err_ve,
    "err_vn": err_vn,
    "drogue_status": drogue_status,
    "sst": sst,
    "sst1": sst1,
    "sst2": sst2,
    "err_sst": err_sst,
    "err_sst1": err_sst1,
    "err_sst2": err_sst2,
    "flg_sst": flg_sst,
    "flg_sst1": flg_sst1,
    "flg_sst2": flg_sst2,
}

# Record array grouping all per-observation variables under their names.
obs_layout = ak.layout.RecordArray(
    list(obs_fields.values()),
    list(obs_fields.keys()),
)
obs = ak.Array(obs_layout)

In [None]:
# Exploratory check: wrap ManufactureSensorType in a NumpyArray and show the
# result as the cell output.
# NOTE(review): this variable is commented out of the trajectory record built
# later in the notebook - presumably because this conversion is not usable
# for it; confirm.
ak.layout.NumpyArray(ds.ManufactureSensorType)

In [None]:
# Field name -> layout for every variable stored once per record, in output
# order. The date variables are passed as raw `.values`; the rest are handed
# to NumpyArray as they come from the dataset.
#
# Variables deliberately left out of the record (they were disabled in the
# original field list): type_buoy, DeploymentShip, DeploymentStatus,
# BuoyTypeManufacturer, BuoyTypeSensorArray, PurchaserFunding, SensorUpgrade,
# Transmissions, DeployingCountry, DeploymentComments, ManufactureSensorType,
# DrogueType, DrogueDetectSensor.
# NOTE(review): presumably these are text variables that NumpyArray cannot
# wrap directly - confirm before re-enabling any of them.
traj_fields = {
    "ID": ak.layout.NumpyArray(ds.ID),
    "rowsize": ak.layout.NumpyArray(ds.rowsize),
    "location_type": ak.layout.NumpyArray(ds.location_type),
    "WMO": ak.layout.NumpyArray(ds.WMO),
    "expno": ak.layout.NumpyArray(ds.expno),
    "deploy_date": ak.layout.NumpyArray(ds.deploy_date.values),
    "deploy_lat": ak.layout.NumpyArray(ds.deploy_lat),
    "deploy_lon": ak.layout.NumpyArray(ds.deploy_lon),
    "end_date": ak.layout.NumpyArray(ds.end_date.values),
    "end_lat": ak.layout.NumpyArray(ds.end_lat),
    "end_lon": ak.layout.NumpyArray(ds.end_lon),
    "drogue_lost_date": ak.layout.NumpyArray(ds.drogue_lost_date.values),
    "type_death": ak.layout.NumpyArray(ds.type_death),
    "CurrentProgram": ak.layout.NumpyArray(ds.CurrentProgram),
    "ManufactureYear": ak.layout.NumpyArray(ds.ManufactureYear),
    "ManufactureMonth": ak.layout.NumpyArray(ds.ManufactureMonth),
    "ManufactureVoltage": ak.layout.NumpyArray(ds.ManufactureVoltage),
    "FloatDiameter": ak.layout.NumpyArray(ds.FloatDiameter),
    "SubsfcFloatPresence": ak.layout.NumpyArray(ds.SubsfcFloatPresence),
    "DrogueLength": ak.layout.NumpyArray(ds.DrogueLength),
    "DrogueBallast": ak.layout.NumpyArray(ds.DrogueBallast),
    "DragAreaAboveDrogue": ak.layout.NumpyArray(ds.DragAreaAboveDrogue),
    "DragAreaOfDrogue": ak.layout.NumpyArray(ds.DragAreaOfDrogue),
    "DragAreaRatio": ak.layout.NumpyArray(ds.DragAreaRatio),
    "DrogueCenterDepth": ak.layout.NumpyArray(ds.DrogueCenterDepth),
    # Nested per-observation record built in the previous cell.
    "obs": obs.layout,
}

# Top-level record: the flat fields above plus the nested "obs" field.
array_layout = ak.layout.RecordArray(
    list(traj_fields.values()),
    list(traj_fields.keys()),
)
array = ak.Array(array_layout)

In [None]:
# Output file, written to the current working directory.
gdp_parquet = "gdp_v2.00.parquet"

# Serialise the nested array to parquet using zstd compression at level 9.
ak.to_parquet(array, gdp_parquet, compression="zstd", compression_level=9)

In [None]:
# Close the xarray dataset opened at the start of the notebook.
ds.close()