## Move some code here that is me learning to read AIMS files and get them into a dataframe
## Then convert to xarray and export to NC

In [174]:
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import datetime as dt
from pathlib import Path
import os
import sys
sys.path.insert(1, '/oa-decadal-climate/work/observations/oceanobs_data/UOT/programs/data-services/lib/python/')
import numpy as np
from netCDF4 import Dataset, date2num
from generate_netcdf_att import generate_netcdf_att, get_imos_parameter_info

In [2]:
# create a new xarray from the csv file

# now let's read in the csv CTD data from AIMS
AIMS_data_path ='/oa-decadal-climate/work/observations/CARSv2_ancillary/AIMS/'
dir_contents = os.listdir(AIMS_data_path)

filelist = Path(AIMS_data_path).rglob('*.csv')# read the csv file
filn = str(next(filelist))
df = pd.read_csv(filn, skiprows=15)

# convert to xarray:
new_data = df.to_xarray()

In [3]:
df

Unnamed: 0,DEPTH,PARAMETER,VALUE,QAQC_VALUE,QAQC_FLAG
0,1.0,%Trans,,,
1,1.0,C,2.53090,,
2,1.0,Chl (f),0.74269,,
3,1.0,Depth (m),1.00000,,
4,1.0,Latitude (deg N),-18.60967,,
...,...,...,...,...,...
303,22.0,Pressure (dB),,,
304,22.0,Salinity,33.20610,,
305,22.0,Sigma-t (Kg/m3),20.83640,,
306,22.0,Temp(°C),28.64150,,


In [4]:
dfgroup = df.groupby('PARAMETER')

In [5]:
dfgroup.ngroups

14

In [6]:
# set up a couple of dictionary to map the cast dimensioned names to the aims names
wodnames = ['originators_station_identifier', 'lat', 'lon', 'date', 'Bottom_Depth','Temperature','Pressure','Salinity','Oxygen', 'z']
aimsnames = ['STATION NAME','LATITUDE','LONGITUDE','SAMPLE DATE', 'TO DEPTH','Temp','Pres','Salinity','Oxygen','Depth']
castdict = dict(zip(aimsnames,wodnames))
castdict2 = dict(zip(wodnames,aimsnames))
print(castdict2)


{'originators_station_identifier': 'STATION NAME', 'lat': 'LATITUDE', 'lon': 'LONGITUDE', 'date': 'SAMPLE DATE', 'Bottom_Depth': 'TO DEPTH', 'Temperature': 'Temp', 'Pressure': 'Pres', 'Salinity': 'Salinity', 'Oxygen': 'Oxygen', 'z': 'Depth'}


In [94]:
aimsnames = ['LATITUDE','LONGITUDE', 'TO DEPTH','Temp','Pres','Salinity','Depth']
stdn = ['latitude', 'longitude', 'sea_floor_depth_blow_sea_surface','sea_water_temperature',
        'sea_water_pressure','sea_water_salinity','depth_below_sea_surface']
units=['degrees_north','degrees_east','m','degrees_C','dbar','1','m']
valid_min=[-90,-180,0,-2.5,-5,2,-5]
valid_max=[90,180,12000,40,12000,41,12000]
stddict = dict(zip(aimsnames,zip(stdn,zip(units,zip(valid_min,valid_max)))))
val = stddict['LATITUDE']
val

('latitude', ('degrees_north', (-90, 90)))

In [95]:
df['PARAMETER'].unique()
# of course, each file has a different set of parameter names and formats. 


array(['%Trans', 'C', 'Chl (f)', 'Depth (m)', 'Latitude (deg N)',
       'Longitude (deg E)', 'OBS (NTU)', 'Oxygen (µmol/kg)', 'PAR(%)',
       'Pressure (dB)', 'Salinity', 'Sigma-t (Kg/m3)', 'Temp(°C)',
       'Water depth (m)'], dtype=object)

In [96]:
dfhead = pd.read_csv(filn, skiprows=range(16, 9999))
df = pd.read_csv(filn, skiprows=15)
dfhead
#dfhead[dfhead.iloc[:,0].str.contains('TO DEPTH')]

Unnamed: 0,TITLE:,AIMS CTD Profile
0,ATTRIBUTION:,http://www.aims.gov.au/docs/cc-attribution.html
1,DISCLAIMER:,http://www.aims.gov.au/docs/disclaimer.html
2,PRIVACY:,http://www.aims.gov.au/docs/privacy-policy.html
3,COPYRIGHT:,http://www.aims.gov.au/docs/cc-copyright.html
4,FILE CREATED:,18-11-2022
5,STATION NAME:,AUV001
6,SAMPLE DATE:,26-02-2011
7,LONGITUDE:,146.4805
8,LATITUDE:,-18.6096667
9,FROM DEPTH,1.0


In [10]:
for value in castdict:
    if dfhead.iloc[:,0].str.contains(value).any():
        df2 = dfhead.loc[dfhead.iloc[:,0].str.contains(value), 'AIMS CTD Profile'].item()
        print(df2)

AUV001
-18.6096667
146.4805
26-02-2011
22.0


In [11]:
names = dfhead.columns

In [12]:
#dfhead[dfhead['TITLE:'] == 'STATION NAME:']
#df2 = dfhead.loc[dfhead.iloc[:,0].str.contains('STATION NAME:'), names[1]].item()
df2 = dfhead.loc[dfhead[names[0]].str.contains('STATION NAME:'), names[1]].item()
print(df2)

AUV001


In [13]:
#'Temp' in castdict.keys()
'Temp' in group

NameError: name 'group' is not defined

In [14]:
datalist = []
dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
names = dfhead.columns
# and the data
df = pd.read_csv(filn, skiprows=15)#let's loop it over all the relevant parameters and add other info for the WOD format
#    df.insert(0,'Cast_Tow_number',cast)
# increment the cast number
#    cast = cast + 1
dfgroup = df.groupby('PARAMETER')

for group in df['PARAMETER'].unique():
    for value in castdict:
        if value in group:
            dat = dfgroup.get_group(group)
            dat = dat.set_index('DEPTH')
            dat = dat.drop(['PARAMETER','QAQC_VALUE'],axis=1)
            dat = dat.rename(columns={'VALUE':castdict[value],'QAQC_FLAG':castdict[value] + '_flag'}) 
            datalist.append(dat)
data = pd.concat(datalist, axis=1)
    # get the data from the header
for value in castdict:
    if dfhead[names[0]].str.contains(value).any():
        df2 = dfhead.loc[dfhead[names[0]].str.contains(value), names[1]].item()
        data.insert(0,castdict[value],df2)


In [15]:
data.count()

Bottom_Depth                      22
date                              22
lon                               22
lat                               22
originators_station_identifier    22
z                                 22
z_flag                             0
Oxygen                            22
Oxygen_flag                        0
Pressure                           0
Pressure_flag                      0
Salinity                          22
Salinity_flag                      0
Temperature                       22
Temperature_flag                   0
dtype: int64

In [16]:
# try setting up manually with WOD dimensions/coords
datalist = []
dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
names = dfhead.columns
# and the data
df = pd.read_csv(filn, skiprows=15)#let's loop it over all the relevant parameters and add other info for the WOD format
#    df.insert(0,'Cast_Tow_number',cast)
# increment the cast number
#    cast = cast + 1
dfgroup = df.groupby('PARAMETER')


In [17]:
df['PARAMETER'].unique()

array(['%Trans', 'C', 'Chl (f)', 'Depth (m)', 'Latitude (deg N)',
       'Longitude (deg E)', 'OBS (NTU)', 'Oxygen (µmol/kg)', 'PAR(%)',
       'Pressure (dB)', 'Salinity', 'Sigma-t (Kg/m3)', 'Temp(°C)',
       'Water depth (m)'], dtype=object)

In [18]:
temperature = dfgroup.get_group('Temp(°C)')['VALUE']
depth = dfgroup.get_group('Depth (m)')['VALUE']


In [19]:
xrds = xr.Dataset(
    coords = dict(
        Temperature_obs = list(temperature),
        z_obs = list(depth)
        ),
    data_vars = dict(
        Temperature = (['Temperature_obs'], temperature),
        z = (['z_obs'], depth)
        )
    )

In [20]:
xrds

# OK, making the ragged array format is turning out to be not so easy
# Let's make individual files dimensioned by depth, like iMOS files

In [100]:
# set up a new dictionaries to map the cast dimensioned names to the aims names
# Have ignored all other parameters for now except for PSAL, TEMP, DEPTH. The other ones have mixed units and 
# haven't the time to parse the units appropriately.
imosnames = ['LATITUDE', 'LONGITUDE', 'TIME', 'BOT_DEPTH','TEMP','PRES_REL','PSAL']
aimsnames = ['LATITUDE','LONGITUDE','SAMPLE DATE', 'TO DEPTH','Temp','Pres','Salinity']
vardict = dict(zip(aimsnames,imosnames))
vardict2 = dict(zip(imosnames,aimsnames))
print(vardict2)
imosglobnames = ['cruise', 'disclaimer', 'attribution', 'license']
aimsglobnames = ['STATION NAME', 'DISCLAIMER', 'ATTRIBUTION', 'COPYRIGHT']
globdict = dict(zip(aimsglobnames,imosglobnames))
print(globdict)

{'LATITUDE': 'LATITUDE', 'LONGITUDE': 'LONGITUDE', 'TIME': 'SAMPLE DATE', 'BOT_DEPTH': 'TO DEPTH', 'TEMP': 'Temp', 'PRES_REL': 'Pres', 'PSAL': 'Salinity'}
{'STATION NAME': 'cruise', 'DISCLAIMER': 'disclaimer', 'ATTRIBUTION': 'attribution', 'COPYRIGHT': 'license', 'FILE CREATED': 'date_created'}


### use netcdf tools

In [178]:
# read a file with the global attributes included and the nc configuration file
global_atts = pd.read_excel('/oa-decadal-climate/work/observations/CARSv2_ancillary/outputNC/cars_global_atts.xlsx', sheet_name='globals')
conf_file_generic = '/tube1/cow074/Documents/cars-v2/notebooks/generate_nc_file_att'
# get the data from the header
dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
names = dfhead.columns

# get the coordinate/depth dimension
depth = df.loc[df['PARAMETER'].str.contains('Depth'), 'VALUE']
# get the other coordinates
lat = dfhead.loc[dfhead[names[0]].str.contains(vardict2['LATITUDE']), names[1]].item()
lon = dfhead.loc[dfhead[names[0]].str.contains(vardict2['LONGITUDE']), names[1]].item()
time = dt.datetime.strptime(dfhead.loc[dfhead[names[0]].str.contains(vardict2['TIME']), names[1]].item(), '%d-%m-%Y')

# create a netcdf object and write depth,time,lat,long to it:
with Dataset('testNC.nc', 'w', format='NETCDF4') as output_netcdf_obj:
    # first create our DEPTH dimension and variable
    output_netcdf_obj.createDimension("DEPTH", depth.size)
    output_netcdf_obj.createVariable("DEPTH", "f", "DEPTH")
    output_netcdf_obj['DEPTH'][:] = depth
    # and lat/lon/time vars which come from the header in the csv file:
    output_netcdf_obj.createVariable('TIME','d', fill_value=get_imos_parameter_info('TIME', '_FillValue'))

    output_netcdf_obj.createVariable("LATITUDE", "f", fill_value=get_imos_parameter_info('LATITUDE', '_FillValue'))
    output_netcdf_obj['LATITUDE'][:] = lat
    output_netcdf_obj.createVariable("LONGITUDE", "f", fill_value=get_imos_parameter_info('LONGITUDE', '_FillValue'))
    output_netcdf_obj['LONGITUDE'][:] = lon  

    # now all the other variables
    for group in df['PARAMETER'].unique():
        data = np.ma.masked_invalid(dfgroup.get_group(group)['VALUE'])
        flag = np.ma.masked_invalid(dfgroup.get_group(group)['QAQC_FLAG'])
        for value in vardict:
            if value in group:
                name = vardict[value]
                stdname = stddict[value]
                #create the variable & QC variable:
                output_netcdf_obj.createVariable(name, "f", ["DEPTH"], 
                            fill_value=get_imos_parameter_info(name, '_FillValue'))
                output_netcdf_obj.createVariable(name + '_quality_control', "b", ["DEPTH"], 
                            fill_value=99)

                # output the data
                output_netcdf_obj[name][:] = data
                output_netcdf_obj[name + '_quality_control'][:] = flag
    #generate all the attributes for the variables & the global attributes too
    generate_netcdf_att(output_netcdf_obj, conf_file_generic, conf_file_point_of_truth=True)

    time_val_dateobj = date2num(time, output_netcdf_obj['TIME'].units, output_netcdf_obj['TIME'].calendar)
    output_netcdf_obj['TIME'][:] = time_val_dateobj
    
    #global attributes from header
    for value in globdict:
        if dfhead[names[0]].str.contains(value).any():
            var = dfhead.loc[dfhead[names[0]].str.contains(value), names[1]].item()
            setattr(output_netcdf_obj, globdict[value], var) 
    datt = dt.datetime.strptime(dfhead.loc[dfhead[names[0]].str.contains('FILE CREATED'),names[1]].item(),'%d-%m-%Y')
    setattr(output_netcdf_obj, 'date_created', datt.strftime("%Y-%m-%dT%H:%M:%SZ"))
    setattr(output_netcdf_obj, 'date_modified', dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"))

In [175]:
datt = dfhead.loc[dfhead[names[0]].str.contains('FILE CREATED'),names[1]].item()
dt.datetime.strptime(datt,'%d-%m-%Y')

datetime.datetime(2022, 11, 18, 0, 0)

In [134]:
ti = dt.datetime.strptime(dfhead.loc[dfhead[names[0]].str.contains(vardict2['TIME']), names[1]].item(), '%d-%m-%Y')
ti.strftime('%Y%m%dT%H%M%SZ')

'20110226T000000Z'

In [92]:
df['PARAMETER'].unique()

array(['%Trans', 'C', 'Chl (f)', 'Depth (m)', 'Latitude (deg N)',
       'Longitude (deg E)', 'OBS (NTU)', 'Oxygen (µmol/kg)', 'PAR(%)',
       'Pressure (dB)', 'Salinity', 'Sigma-t (Kg/m3)', 'Temp(°C)',
       'Water depth (m)'], dtype=object)

## Using XARRAY - not getting what I want, seems to be set up for using time as a dimension

In [23]:
testds = xr.Dataset({
    'DEPTH': xr.DataArray(
        data = depth,
        dims={'DEPTH': depth},
        attrs={'standard_name': 'depth', 'long_name': 'depth', 'units': 'm' }
    )
    })
testds

In [52]:
# get the data from the header
dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
names = dfhead.columns

# get the coordinate/depth dimension
depth = df.loc[df['PARAMETER'].str.contains('Depth'), 'VALUE']
# get the other coordinates
lat = dfhead.loc[dfhead[names[0]].str.contains(vardict2['LATITUDE']), names[1]].item()
lon = dfhead.loc[dfhead[names[0]].str.contains(vardict2['LONGITUDE']), names[1]].item()
time = pd.to_datetime([dfhead.loc[dfhead[names[0]].str.contains(vardict2['TIME']), names[1]].item()],
                     utc=True,dayfirst=True)
# make the dataset with depth dimension
testds = xr.Dataset({
    'DEPTH': xr.DataArray(
        data = depth,
        dims={'DEPTH': depth},
        attrs={'standard_name': 'depth', 'long_name': 'depth', 'units': 'm' }
    ),
    'TIME': xr.DataArray(
        data = time,
        attrs={'standard_name': 'time', 'long_name': 'time', 'units': 'days since 1950-01-01T00:00:00Z' }
    ),    
    'LATITUDE': xr.DataArray(
        data = lat,
        attrs={'standard_name': 'latitude', 'long_name': 'latitude', 'units': 'degrees_north' }
    ),    
    'LONGITUDE': xr.DataArray(
        data = lon,
        attrs={'standard_name': 'longitude', 'long_name': 'longitude', 'units': 'degrees_east' }
    )
    })

# get all the variables
for group in df['PARAMETER'].unique():
    data = dfgroup.get_group(group)['VALUE']
    flag = dfgroup.get_group(group)['QAQC_FLAG']
    dim = list(data)
    for value in vardict:
        if value in group:
            name = vardict[value]
            stdname = stddict[value]
            # add each data array to the dataset:
            testds[name] = xr.DataArray(
                list(data), dims={'DEPTH'},
                attrs={'long_name': stdname[0], 'standard_name': stdname[0], 
                       'units': stdname[1][0],'coordinates': 'TIME LATITUDE LONGITUDE DEPTH',
                      'valid_min': stdname[1][1][0],'valid_max':stdname[1][1][1]}
                )
my_encoding = {
    'DEPTH': {
        'dtype': 'float32',
        '_FillValue': None
        },
    'TIME': {
        'dtype': 'int32',
        '_FillValue': None
        },
    'LATITUDE': {
        'dtype': 'int32',
        '_FillValue': None
        },
    'LONGITUDE': {
        'dtype': 'int32',
        '_FillValue': None
        },
    'DOXY': {
        'dtype': 'float32',
        '_FillValue': 999999,
        'zlib': False
        },
    'TEMP': {
        'dtype': 'float32',
        '_FillValue': 999999,
        'zlib': False
        },
    'PRES_REL': {
        'dtype': 'float32',
        '_FillValue': 999999,
        'zlib': False
        },
    'PSAL': {
        'dtype': 'float32',
        '_FillValue': 999999,
        'zlib': False
        }
}

ValueError: different number of dimensions on data and dims: 1 vs 0

In [51]:
testds #= testds.drop_dims('dim_0')

In [48]:
testds.to_netcdf('test2.nc','w', encoding=my_encoding)

KeyError: 'TIME'

## Attempts at the WOD ragged array format follow

In [511]:
# get the data from the header
dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
names = dfhead.columns
stn = dfhead.loc[dfhead[names[0]].str.contains(castdict2['originators_station_identifier']), names[1]].item()

# get the coordinates
lat = dfhead.loc[dfhead[names[0]].str.contains(castdict2['lat']), names[1]].item()
lon = dfhead.loc[dfhead[names[0]].str.contains(castdict2['lon']), names[1]].item()
time = pd.to_datetime([dfhead.loc[dfhead[names[0]].str.contains(castdict2['date']), names[1]].item()],
                     utc=True,dayfirst=True)
strnlen = 170

#get the z coordinate now
data = dfgroup.get_group(df['PARAMETER'].unique()[0])['VALUE']
z = len(data)

# make the dataset with lat/lon etc
testds = xr.Dataset({
    'time': xr.DataArray(
        data = time,
        dims={'casts': z},
        attrs={'standard_name': 'time', 'long_name': 'time', 'units': 'days since 1770-01-01T00:00:00Z' }
    ),
    'lat': xr.DataArray(
        data = time,
        dims={'casts': z},
        attrs={'standard_name': 'latitude', 'long_name': 'latitude', 'units': 'degrees_north' }
    ),
    'lon': xr.DataArray(
        data = time,
        dims={'casts': z},
        attrs={'standard_name': 'longitude', 'long_name': 'longitude', 'units': 'degrees_east' }
    )
    })

#assign the originators station id
testds['originators_station_identifier'] = xr.DataArray(
    [stn], dims={'casts': z},
    attrs={'long_name': 'originators_station_identifier'}
    ).astype('str')

# assign the time variable - when this is included, it also creates a variable for casts in the output file
# makes casts a coordinate and an index. Can't be removed??
#testds['time'] = xr.DataArray(
#    time, dims={'casts': z},
#    attrs={'standard_name': 'time', 'long_name': 'time', 'units': 'days since 1770-01-01 00:00:00 UTC' }
#    ).astype(np.double)

# get all the variables
for group in df['PARAMETER'].unique():
    data = dfgroup.get_group(group)['VALUE']
    flag = dfgroup.get_group(group)['QAQC_FLAG']
    dim = list(data)
    for value in castdict:
        if value in group:
            name = castdict[value]
            stdname = stddict[value]
            # add each data array to the dataset:
            testds[name] = xr.DataArray(
                list(data), dims={name + '_obs'},
                attrs={'long_name': stdname[0], 'standard_name': stdname[0], 
                       'units': stdname[1], 'ancillary_variables': name + '_sigfigs '
                      + name + '_WODflag ' + name + '_WODprofileflag ' + name + '_origflag '}
                ).astype('float32')
            testds[name + '_sigfigs'] = xr.DataArray(
                [len(str(list(data)[0]))-1]*len(data), dims={name + '_obs'},
                attrs={'long_name': stdname[0] + ' significant_figures'}
                ).astype('int8')
            testds[name + '_row_size'] = xr.DataArray(
                [z], dims={'casts': z},
                attrs={'long_name': 'number of ' + name + ' observations for this cast',
                      'sample_dimension': name + '_obs'}
                ).astype('int32') 
            testds[name + '_origflag'] = xr.DataArray(
                list(flag), dims={name + '_obs'},
                attrs={'standard_name': name + ' status_flag', 'comment': 'Originator flags are dependent on origflagset'}
                ).astype('float32')
            if name == 'Temperature' or name == 'Salinity':
                testds[name + '_Scale'] = xr.DataArray(
                    [''], dims={'casts': z},
                    attrs={'long_name': 'Scale upon which values were measured'}
                    ).astype('str')
            if not name == 'Pressure':
                testds[name + '_WODflag'] = xr.DataArray(
                    np.zeros(len(data)), dims={name + '_obs'},
                    attrs={'long_name': 'WOD_profile_flag', 'flag_values': [0,1,2,3,4,5,6,7,8,9],
                    'flag_meanings': 'accepted range_out inversion gradient anomaly gradient+inversion range+inversion range+gradient range+anomaly range+inversion+gradient'},
                    ).astype('int8')
            if not name == 'z' and not name == 'Pressure':
                testds[name + '_WODprofileflag'] = xr.DataArray(
                    [0], dims={'casts': z},
                    attrs={'long_name': 'WOD_profile_flag', 'flag_values': [0,1,2,3,4,5,6,7,8,9],
                    'flag_meanings': 'accepted annual_sd_out density_inversion cruise seasonal_sd_out monthly_sd_out annual+seasonal_sd_out anomaly_or_annual+monthly_sd_out seasonal+monthly_sd_out annual+seasonal+monthly_sd_out'},
                    ).astype('int8')


In [512]:
testds

In [466]:
time

DatetimeIndex(['2011-02-26 00:00:00+00:00'], dtype='datetime64[ns, UTC]', freq=None)

In [513]:
# can we add the coordinates now?
testds.reset_coords(drop=True)
testds.assign_coords({'time': time, 'lat': lat, 'lon': lon, 'z':z})

In [457]:
# try writing it out as a netcdf and see what it looks like
outf = '/oa-decadal-climate/work/observations/CARSv2_ancillary/AIMSctd.nc'
testds.to_netcdf(outf,'w','NETCDF4')

# fill values aren't set correctly, remove the .fillna() requirements from the creation. Just have NAN in fillvalue

In [469]:
# what happens when I read in what I just wrote out?
ds = xr.open_dataset(outf, decode_times=True, engine='netcdf4')
ds

ValueError: Failed to decode variable 'time': unable to decode time units 'days since 1770-01-01 00:00:00 UTC' with 'the default calendar'. Try opening your dataset with decode_times=False or installing cftime if it is not installed.

In [245]:
filelist = Path(AIMS_data_path).rglob('*.csv')# read the csv file
dataall = []
casts = 0
for filn in filelist:
    datalist = []
    #print(filn)
    # read each aims file
    # let's grab the header information
    dfhead = pd.read_csv(filn, skiprows=range(15, 9999))
    names = dfhead.columns
    # and the data
    df = pd.read_csv(filn, skiprows=15)#let's loop it over all the relevant parameters and add other info for the WOD format
    dfgroup = df.groupby('PARAMETER')

    for group in df['PARAMETER'].unique():
        for value in castdict:
            if value in group:
                dat = dfgroup.get_group(group)
                dat = dat.set_index('DEPTH')
                dat = dat.drop(['PARAMETER','QAQC_VALUE'],axis=1)
                dat = dat.rename(columns={'VALUE':castdict[value],'QAQC_FLAG':castdict[value] + '_flag'}) 
                datalist.append(dat)
    data = pd.concat(datalist, axis=1)
    # get the data from the header
    for value in castdict:
        if dfhead[names[0]].str.contains(value).any():
            df2 = dfhead.loc[dfhead[names[0]].str.contains(value), names[1]].item()
            data.insert(0,castdict[value],df2)
        
    #drop nan rows where no temperature value
    data = data.dropna(subset=['Temperature'])
    # reset the data index
    data = data.reset_index()

    #make an xarray and append
    ds = data.to_xarray()
    dataall.append(ds)
    casts = casts + 1

In [246]:
# combine all the files
dsall = xr.concat(dataall, dim='index')

In [247]:
dsall

In [251]:
casts

2396

In [250]:
# can we add the cast index from the count?
c = xr.DataArray(casts, ['casts', casts])
dsnew = dsall.expand_dims(casts = c)
dsnew

ValueError: IndexVariable objects must be 1-dimensional

In [231]:
dsall['Oxygen_obs'] = dsall['Oxygen'].count()


In [232]:
dsall

In [244]:
# see if we can get a casts index
dsall['originators_station_identifier'].groupby(dsall['originators_station_identifier']).count()

In [236]:
casts