## Take an existing WOD ragged array format file and create a new empty one ready for populating with new data
### Then read all the AIMS csv data files and fill the WOD ragged array version with the data

Bec Cowley, April 2023

In [1]:
import xarray as xr
import os
import netCDF4 as nc
import pandas as pd
from datetime import datetime as dt


# Using NetCDF tools

In [2]:
# Existing WOD ragged-array CTD file, used as the structural template
infile = '/oa-decadal-climate/work/observations/WOD_CARS2022/2018/wod_ctd_2018.nc'
# Destination file: gets the same variables as the template but no data
outfile = '/oa-decadal-climate/work/observations/CARSv2_ancillary/outputNC/test.nc'

In [3]:
# Build an empty copy of the template file with the netCDF4 library: same
# global attributes, dimensions, variables and variable attributes, no data.
varnames = []
with nc.Dataset(infile) as src, nc.Dataset(outfile, 'w') as dst:
    # start from the template's global attributes, then override the
    # institution for the AIMS data
    dst.setncatts(src.__dict__)
    dst.setncattr('institution', 'AIMS')

    # recreate every dimension by name; a size of None makes it unlimited
    for dim_name in src.dimensions:
        dst.createDimension(dim_name, None)

    # recreate every variable (type, dimensions and attributes only) and
    # record its name so the AIMS data can be mapped onto it later
    for name, variable in src.variables.items():
        dst_var = dst.createVariable(name, variable.datatype, variable.dimensions)
        dst_var.setncatts(variable.__dict__)
        varnames.append(name)
    

In [4]:
# preview the first 30 variable names collected from the template file
varnames[:30]

['country',
 'WOD_cruise_identifier',
 'originators_cruise_identifier',
 'wod_unique_cast',
 'originators_station_identifier',
 'lat',
 'lon',
 'time',
 'date',
 'GMT_time',
 'Access_no',
 'Platform',
 'Institute',
 'Cast_Tow_number',
 'Orig_Stat_Num',
 'Bottom_Depth',
 'Cast_Direction',
 'High_res_pair',
 'dataset',
 'Recorder',
 'real_time',
 'dbase_orig',
 'origflagset',
 'z',
 'z_WODflag',
 'z_origflag',
 'z_sigfigs',
 'z_row_size',
 'Temperature',
 'Temperature_sigfigs']

In [31]:
# Mapping from the AIMS parameter labels to the WOD variable names
vardict = {
    'Oxygen (µmol/kg)': 'Oxygen',
    'Chl (f)': 'Chlorophyll',
    'Salinity': 'Salinity',
    'Pressure (dB)': 'Pressure',
    'Temp(°C)': 'Temperature',
}
# keep the two parallel name lists available too, in the same order
aimsnames = list(vardict)
wodnames = list(vardict.values())
print(vardict)

{'Oxygen (µmol/kg)': 'Oxygen', 'Chl (f)': 'Chlorophyll', 'Salinity': 'Salinity', 'Pressure (dB)': 'Pressure', 'Temp(°C)': 'Temperature'}


In [98]:
# Mapping from the AIMS header labels to the WOD cast-dimensioned variable names
# NOTE(review): the header rows displayed later in this notebook show some
# labels with a trailing ':' (e.g. 'STATION NAME:') - confirm the labels are
# normalised before they are looked up here.
castdict = {
    'STATION NAME': 'originators_station_identifier',
    'LATITUDE': 'lat',
    'LONGITUDE': 'lon',
    'SAMPLE DATE': 'date',
    'TO DEPTH': 'Bottom_Depth',
}
# keep the two parallel name lists available too, in the same order
aimsnames = list(castdict)
wodnames = list(castdict.values())
print(castdict)

{'STATION NAME': 'originators_station_identifier', 'LATITUDE': 'lat', 'LONGITUDE': 'lon', 'SAMPLE DATE': 'date', 'TO DEPTH': 'Bottom_Depth'}


# Now using xarray tools and getting actual data in

In [43]:
from pathlib import Path

# Locate the csv CTD data from AIMS
AIMS_data_path = '/oa-decadal-climate/work/observations/CARSv2_ancillary/AIMS/'
# names of the entries directly inside the AIMS directory
dir_contents = os.listdir(AIMS_data_path)

# lazy iterator over every .csv file anywhere below the AIMS directory
aims_root = Path(AIMS_data_path)
filelist = aims_root.rglob('*.csv')
        

In [44]:
# testing at the moment: take just the next csv file yielded by the search
first_csv = next(filelist, None)
if first_csv is None:
    # an exhausted/empty search would otherwise surface as a bare StopIteration
    raise FileNotFoundError('No csv files found under ' + AIMS_data_path)
filn = str(first_csv)

In [45]:
# try pandas to read the file; the first 15 lines are skipped so that parsing
# starts at the data table rather than at the header block
df = pd.read_csv(filn, skiprows=15)
# list the distinct parameter labels present in this file
df['PARAMETER'].unique()

array(['%Trans', 'C', 'Chl (f)', 'Depth (m)', 'Latitude (deg N)',
       'Longitude (deg E)', 'OBS (NTU)', 'Oxygen (µmol/kg)', 'PAR(%)',
       'Pressure (dB)', 'Salinity', 'Sigma-t (Kg/m3)', 'Temp(°C)',
       'Water depth (m)'], dtype=object)

In [22]:
# open our empty nc file with xarray (open_dataset gives an xarray Dataset)
ds = xr.open_dataset(outfile)
# display the dataset summary
ds

In [40]:
# Loop through the keys in the data dictionary
# for key in vardict:
# (hard-coded to temperature for now while testing)
# find all rows in dataframe that match the key:
matches = df.loc[df['PARAMETER'] == 'Temp(°C)']

# Name the dimension explicitly: an unnamed DataArray gets the default
# dimension 'dim_0', so concat would not simply append the values along
# 'Temperature_obs'.
data = xr.DataArray(list(matches['VALUE']), dims=('Temperature_obs',))

# Number of observations is the number of matching rows. DataFrame.size is
# rows x columns, which over-counts.
nobs = len(matches)

#ds['Temperature_row_size'][:] =xr.DataArray(nobs)
#ds['Temperature'][:] = xr.DataArray(matches['VALUE'], dims='Temperature_obs')
# NOTE(review): this rebinds ds from the whole Dataset to the concatenated
# Temperature DataArray - confirm that is intended before relying on ds later.
ds = xr.concat([ds['Temperature'], data], dim='Temperature_obs')


In [73]:
# are the lat/long values the same as in the header?
# (only the rows whose parameter label contains 'Lat' are checked here)
ind = df['PARAMETER'].str.contains('Lat')
dflats = df.loc[ind,:]
dflats
# yes they are (to the precision shown), so there is no need to read these as
# variables; just use the header information

Unnamed: 0,DEPTH,PARAMETER,VALUE,QAQC_VALUE,QAQC_FLAG
4,1.0,Latitude (deg N),-18.60967,,
18,2.0,Latitude (deg N),-18.60967,,
32,3.0,Latitude (deg N),-18.60967,,
46,4.0,Latitude (deg N),-18.60967,,
60,5.0,Latitude (deg N),-18.60967,,
74,6.0,Latitude (deg N),-18.60967,,
88,7.0,Latitude (deg N),-18.60967,,
102,8.0,Latitude (deg N),-18.60967,,
116,9.0,Latitude (deg N),-18.60967,,
130,10.0,Latitude (deg N),-18.60967,,


In [62]:
# let's grab the header information: keep only file lines 0-15 and skip
# everything after them. A callable is used rather than range(16, 9999) so
# that files longer than 9999 lines do not leak data rows into the header.
dfhead = pd.read_csv(filn, skiprows=lambda line_no: line_no >= 16)
dfhead

Unnamed: 0,TITLE:,AIMS CTD Profile
0,ATTRIBUTION:,http://www.aims.gov.au/docs/cc-attribution.html
1,DISCLAIMER:,http://www.aims.gov.au/docs/disclaimer.html
2,PRIVACY:,http://www.aims.gov.au/docs/privacy-policy.html
3,COPYRIGHT:,http://www.aims.gov.au/docs/cc-copyright.html
4,FILE CREATED:,18-11-2022
5,STATION NAME:,AUV001
6,SAMPLE DATE:,26-02-2011
7,LONGITUDE:,146.4805
8,LATITUDE:,-18.6096667
9,FROM DEPTH,1.0


In [5]:
# Load the table of global attributes from the 'globals' sheet of the spreadsheet
global_atts_file = '/oa-decadal-climate/work/observations/CARSv2_ancillary/outputNC/cars_global_atts.xlsx'
global_atts = pd.read_excel(global_atts_file, sheet_name='globals')

In [None]:
# assign the global attributes from the spreadsheet
# Collect the attribute name/value pairs into a dict; a for-statement with no
# body is a syntax error. Writing them to the output file is still to be done.
# NOTE(review): assumes the sheet has 'Attribute' and 'Value' columns, as in
# the table displayed further down - confirm.
global_att_values = {}
for idx, row in global_atts.iterrows():
    # rows with an empty Value (shown blank in the table) are left unset
    if pd.notna(row['Value']):
        global_att_values[row['Attribute']] = row['Value']

In [6]:
# take a look at our global attributes (the table read from the spreadsheet)
global_atts

Unnamed: 0,Attribute,Value
0,institution,
1,source,Data collated from institution global attribut...
2,references,World Ocean Database 2018. URL:http://data.nod...
3,title,CARSv2 multiple cast file
4,summary,Data for multiple casts from source global att...
5,id,
6,naming_authority,gov.noaa.nodc
7,geospatial_lat_min,-75.29098
8,geospatial_lat_max,83.3485
9,geospatial_lat_resolution,point
