In [1]:
#import xarray
#import fsspec
import numpy as np
#import s3fs
#import proplot
import os

from matplotlib import pyplot as plt
import pandas
import json
import pyarrow
import pyarrow.parquet as pyarrow_parquet


In [2]:
def _encode_metadata_key(key):
    """Return ``key`` as bytes, the key type pyarrow reports for stored metadata."""
    return key if isinstance(key, bytes) else str(key).encode('utf-8')


def _encode_metadata_value(value):
    """Return ``value`` as bytes: raw bytes pass through, anything else is json-encoded."""
    return value if isinstance(value, bytes) else json.dumps(value).encode('utf-8')


def set_metadata(tbl, col_meta=None, tbl_meta=None):
    """Store table- and column-level metadata as json-encoded byte strings.

    Table-level metadata is stored in the table's schema.
    Column-level metadata is stored in the table columns' fields.

    To update the metadata, first new fields are created for all columns.
    Next a schema is created using the new fields and updated table metadata.
    Finally a new table is created by casting the old one to the new schema.

    Existing metadata is preserved; entries with the same key are overwritten.
    Keys are normalised to bytes so that a new ``'key'`` replaces an existing
    ``b'key'`` instead of being stored next to it. Values that are already
    ``bytes`` are stored unchanged (at both table and column level); every
    other value is json-encoded.

    Args:
        tbl (pyarrow.Table): The table to store metadata in
        col_meta: A json-serializable dictionary with column metadata in the form
            {
                'column_1': {'some': 'data', 'value': 1},
                'column_2': {'more': 'stuff', 'values': [1,2,3]}
            }
            Entries for columns that are not in the table are ignored.
            ``None`` (the default) is treated as an empty dictionary.
        tbl_meta: A json-serializable dictionary with table-level metadata.
            ``None`` (the default) is treated as an empty dictionary.

    Returns:
        The table with the updated schema, or ``tbl`` itself when both
        ``col_meta`` and ``tbl_meta`` are empty.
    """
    col_meta = col_meta or {}
    tbl_meta = tbl_meta or {}

    # Nothing to store: hand the table back untouched
    if not (col_meta or tbl_meta):
        return tbl

    # Create updated column fields with new metadata
    fields = []
    for col in tbl.schema.names:
        field = tbl.field(col)
        if col in col_meta:
            # Work on a copy so the metadata of the input field is never mutated
            metadata = dict(field.metadata or {})
            for k, v in col_meta[col].items():
                metadata[_encode_metadata_key(k)] = _encode_metadata_value(v)
            field = field.with_metadata(metadata)
        fields.append(field)

    # Get updated table metadata
    tbl_metadata = dict(tbl.schema.metadata or {})
    for k, v in tbl_meta.items():
        tbl_metadata[_encode_metadata_key(k)] = _encode_metadata_value(v)

    # Create new schema with updated field metadata and updated table metadata
    schema = pyarrow.schema(fields, metadata=tbl_metadata)

    # With updated schema build new table (shouldn't copy data)
    return tbl.cast(schema)

In [3]:
def Get_CODA_id(dates,source_data,obs_platform,year,profile_count):
    """Build CODA identifiers for one or more profile dates.

    A valid date produces
        <source_data><obs_platform><YYYYMMDD><profile_count, zero padded to 4>
    and a missing date (NaT) produces
        <source_data><obs_platform><year>XXXX<counter, zero padded to 4>
    where the counter starts at ``profile_count`` and increases by one for
    each further missing date in ``dates``. Starting at ``profile_count``
    (instead of restarting at 1 on every call) keeps the identifiers of
    undated profiles distinct when the function is called once per profile.

    Args:
        dates: A single date or a sequence/array of dates; anything numpy can
            convert to ``datetime64[s]``. Only the calendar day is used.
        source_data (str): Source dataset prefix, e.g. 'WOD'.
        obs_platform (str): Observation platform code, e.g. 'ctd'.
        year: Year written into the identifier of undated profiles.
        profile_count (int): Profile counter written into the identifier.
            All valid dates in ``dates`` share this value.

    Returns:
        numpy.ndarray: 1-D byte-string array with one identifier per date.
        The item size is 20 bytes, widened when an identifier is longer so
        that identifiers are never truncated.
    """
    minimum_id_length = 20

    # Flatten so that scalars, lists and arrays are all handled the same way
    dates_1d = np.asarray(dates, dtype='datetime64[s]').reshape(-1)
    date_is_missing = np.isnat(dates_1d)

    id_prefix = source_data + obs_platform
    coda_ids = []
    missing_date_count = 0
    for i_date, is_missing in zip(dates_1d, date_is_missing):
        if is_missing:
            missing_counter = profile_count + missing_date_count
            coda_ids.append(id_prefix + str(year) + 'XXXX' + f'{missing_counter:04}')
            missing_date_count += 1
        else:
            day_string = np.datetime_as_string(i_date, unit='D').replace('-','')
            coda_ids.append(id_prefix + day_string + f'{profile_count:04}')
    #END for i_date

    # Widen the byte-string dtype when needed rather than silently truncating
    id_length = max([minimum_id_length] + [len(i_id) for i_id in coda_ids])

    return np.array(coda_ids, dtype='S{}'.format(id_length))

In [4]:
import sys

# Directory of the local wodpy checkout that provides the `wodnc` module.
WODPY_DIRECTORY = '/home/cha674/wodpy/wodpy'
# Guard the append so re-running this cell does not add duplicate entries
if WODPY_DIRECTORY not in sys.path:
    sys.path.append(WODPY_DIRECTORY)
import wodnc

In [6]:
# First and last year (both inclusive) of WOD files to convert.
START_YEAR = 2021
END_YEAR = 2024

In [None]:
import pyarrow

# Convert yearly WOD ragged-array netCDF files (one per platform) into
# parquet files with one row per observed level. Profile-level values
# (position, time, CODA id, ancillary metadata) are repeated on every row of
# their profile, and the netCDF variable attributes are stored as column
# metadata in the parquet schema.

# --- Configuration ---------------------------------------------------------
WOD_directory = '/datasets/work/soop-xbt/work/CARS/WOD/'
platforms_to_get = ['ctd','pfl','osd','xbt','mrb','mbt','apb','gld']

WOD_file_name_stem = 'wod_'

profile_vars_to_get   = ['Temperature','Salinity','Oxygen','Chlorophyll','Silicate','Phosphate','Nitrate','pH','Alkalinity','tCO2','z']
point_vars_to_get     = ['lat','lon','wod_unique_cast']
ancillary_vars_to_get = ['origflagset','country','dataset','Access_no','Recorder',
                         'dbase_orig','Platform','Project','WOD_cruise_identifier',
                         'Institute','needs_z_fix','Ocean_Vehicle','Temperature_Instrument']

output_path = '/datasets/work/soop-xbt/work/CARS/CODA'

FILL_VALUE = -10000000000.0

# list.append() returns None and would nest the list, so concatenate instead
column_names = point_vars_to_get + profile_vars_to_get

# Source dataset prefix used in the CODA identifiers
source_data = 'WOD'


# --- Conversion ------------------------------------------------------------
for i_year in range(START_YEAR,END_YEAR+1):
    print('Working on year: ', i_year)
    for i_platform in platforms_to_get:
        print('Working on platform: ', i_platform)

        file_name       = WOD_file_name_stem + i_platform + '_' + str(i_year) + '.nc'
        input_file_path = os.path.join(WOD_directory,str(i_year),file_name)
        if not os.path.isfile(input_file_path):
            print('No file named ', file_name, '. Skipping')
            continue

        WOD_ragged_object = wodnc.Ragged(input_file_path)

        vars_in_dataset = list(WOD_ragged_object.variables().keys())
        n_profiles      = WOD_ragged_object.ncasts()

        # Without any cast there is nothing to concatenate or write
        if n_profiles == 0:
            print('No profiles in ', file_name, '. Skipping')
            continue

        # Per-variable lists holding one array per profile
        profile_data_container    = {}
        # Per-variable lists holding one value per profile
        ancillary_data_container  = {}
        # netCDF attributes per output column (becomes parquet column metadata)
        attributes_data_container = {}

        data_vars_to_get = profile_vars_to_get + point_vars_to_get

        for i_var in data_vars_to_get:
            if i_var not in vars_in_dataset:
                continue

            profile_data_container[i_var]    = []
            attributes_data_container[i_var] = {}

            if i_var in profile_vars_to_get:
                # Only register flag columns that really exist in this file;
                # a registered but missing variable would stay empty and
                # break the concatenation further down.
                for flag_suffix in ('_WODflag', '_origflag'):
                    flag_name = i_var + flag_suffix
                    if flag_name in vars_in_dataset:
                        profile_data_container[flag_name]    = []
                        attributes_data_container[flag_name] = {}
        #END for i_var

        for i_ancillary_var in ancillary_vars_to_get:
            if i_ancillary_var in vars_in_dataset:
                ancillary_data_container[i_ancillary_var]  = []
                attributes_data_container[i_ancillary_var] = {}

        # Variables read from the file (every one of them is in vars_in_dataset)
        vars_to_CODA = list(profile_data_container.keys())

        # Derived columns
        profile_data_container['time']     = []
        profile_data_container['CODA_id']  = []
        attributes_data_container['time']    = {'long_name': 'date'}
        attributes_data_container['CODA_id'] = {'Comment':'Unique CODA identifier with format <Source Dataset><Obs Platform><YYYYMMDD><Profile counter>'}

        # Number of output rows per profile
        number_of_levels_by_profile = []

        for i_profile in range(0,n_profiles):

            if (i_profile % 1000) ==0:
                print("Profile: ", i_profile, ' of ', n_profiles)

            WOD_profile_object   = wodnc.ncProfile(WOD_ragged_object,i_profile)
            n_levels             = WOD_profile_object.n_levels()

            # A profile without levels still gets a single (NaN filled) row
            n_rows = n_levels if n_levels != 0 else 1
            number_of_levels_by_profile.append(n_rows)

            for i_var in vars_to_CODA:
                if WOD_profile_object.is_level_data(i_var):

                    current_variable = WOD_profile_object.level_unpack(i_var)

                    # Variable missing on this profile: fill with NaN
                    if current_variable.size != 0:
                        profile_data_container[i_var].append(current_variable)
                    else:
                        profile_data_container[i_var].append(np.repeat(np.nan,n_rows))

                elif WOD_profile_object.is_metadata(i_var):
                    # One value per profile: repeat it on every row
                    current_point_data = WOD_profile_object.metadata(i_var)
                    profile_data_container[i_var].append( np.repeat(current_point_data,n_rows) )
                #END if WOD_profile_object.is_level_data(i_var)

                # Attributes are read once, from the first profile
                if i_profile == 0:
                    variable_attributes = WOD_profile_object.show_variable_attr(i_var)
                    for i_attribute in variable_attributes:
                        attributes_data_container[i_var][i_attribute] = str(WOD_profile_object.get_variable_attr(i_var,i_attribute))
                    #END for i_attribute
                #END if i_profile ==0:
            #END for i_var in vars_to_CODA

            #Extract datetimes
            current_time     = WOD_profile_object.datetime()
            current_datetime = np.datetime64(current_time).astype('datetime64[s]')
            profile_data_container['time'].append( np.repeat(current_datetime,n_rows) )

            for i_ancillary_var in ancillary_data_container.keys():
                current_ancillary_var = WOD_profile_object.metadata(i_ancillary_var)

                if isinstance(current_ancillary_var,str):
                    # Strings are left-justified and padded to 100 characters
                    ancillary_data_container[i_ancillary_var].append(f"{current_ancillary_var:<100}")
                else:
                    ancillary_data_container[i_ancillary_var].append(current_ancillary_var)
                #END if isinstance()
                if i_profile ==0:
                    variable_attributes = WOD_profile_object.show_variable_attr(i_ancillary_var)
                    for i_attribute in variable_attributes:
                        attributes_data_container[i_ancillary_var][i_attribute] = str(WOD_profile_object.get_variable_attr(i_ancillary_var,i_attribute))
                    #END for i_attribute in variable_attributes
                #END if i_profile
            #END for i_ancillary_variable

            #Set the CODA Identifier
            CODA_id = Get_CODA_id(current_datetime,source_data,i_platform,i_year,i_profile)
            profile_data_container['CODA_id'].append(np.repeat(CODA_id,n_rows))
        #END for i_profile


        #Build Dataframe
        print('Building dataframe')
        wod_dataframe = pandas.DataFrame(
            {i_var: np.concatenate(profile_data_container[i_var]) for i_var in profile_data_container}
        )
        print("Built basic dataframe")

        for i_ancillary_var in ancillary_data_container.keys():
            print(i_ancillary_var)
            # Repeat each profile's value on all rows of that profile
            current_ancillary_var = [
                np.repeat(i_value, i_row_count)
                for i_value, i_row_count in zip(ancillary_data_container[i_ancillary_var],
                                                number_of_levels_by_profile)
            ]
            wod_dataframe.insert(len(wod_dataframe.columns),i_ancillary_var,np.concatenate(current_ancillary_var) )
        #END for i_ancillary

        wod_pyarrow_table = pyarrow.Table.from_pandas(wod_dataframe)
        wod_pyarrow_table = set_metadata(wod_pyarrow_table, col_meta=attributes_data_container, tbl_meta={'Parent file':file_name})

        output_file_name = 'WOD2018_CODA_' + str(i_year) + '_' + str(i_platform) + '.parquet'
        print('Writing file:', output_file_name)
        output_directory = os.path.join(output_path,str(i_year))
        # Also creates missing parent directories; no error if it already exists
        os.makedirs(output_directory, exist_ok=True)
        pyarrow_parquet.write_table(wod_pyarrow_table, os.path.join(output_directory,output_file_name),compression='snappy')

Working on year:  2021
Working on platform:  ctd
Profile:  0  of  12168
Profile:  1000  of  12168
Profile:  2000  of  12168
Profile:  3000  of  12168
Profile:  4000  of  12168
Profile:  5000  of  12168
Profile:  6000  of  12168
Profile:  7000  of  12168
Profile:  8000  of  12168
Profile:  9000  of  12168
Profile:  10000  of  12168
Profile:  11000  of  12168
Profile:  12000  of  12168
Building dataframe
Built basic dataframe
origflagset
country
dataset
Access_no
Recorder
dbase_orig
Platform
WOD_cruise_identifier
Institute
needs_z_fix
Temperature_Instrument
Writing file: WOD2018_CODA_2021_ctd.parquet
Working on platform:  pfl
Profile:  0  of  171319
Profile:  1000  of  171319
Profile:  2000  of  171319
Profile:  3000  of  171319
Profile:  4000  of  171319
Profile:  5000  of  171319
Profile:  6000  of  171319
Profile:  7000  of  171319
Profile:  8000  of  171319
Profile:  9000  of  171319
Profile:  10000  of  171319
Profile:  11000  of  171319
Profile:  12000  of  171319
Profile:  13000  

np.datetime64('2010-01-01T07:45:00')