# Convert .csv to .nc

NetCDF instructions:
- CF Conventions: http://cfconventions.org/
- CF Standard names: http://cfconventions.org/Data/cf-standard-names/current/build/cf-standard-name-table.html
- Oak Ridge National Lab guide: https://daac.ornl.gov/submit/netcdfrequirements/


In [1]:
import os
import sys
import shutil
import xarray as xr
import pandas as pd
from pathlib import Path
from datetime import datetime
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found
# The path is relative, so it is resolved against the notebook's working directory
config_file = '../0_config/config.txt'

In [3]:
# Get the required info from the config file
# data_path is the root folder that the basin and metadata paths are joined onto
data_path     = cs.read_from_config(config_file,'data_path')

# CAMELS-spat metadata
# NOTE(review): 'cs_basin_path' is the same config key that is read for cs_basin_folder below,
#               so the metadata files are looked up inside the basin folder - confirm this is intended
cs_meta_path  = cs.read_from_config(config_file,'cs_basin_path')
cs_meta_name  = cs.read_from_config(config_file,'cs_meta_name')
cs_unusable_name = cs.read_from_config(config_file,'cs_unusable_name')

# Basin folder, joined onto data_path
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path) / cs_basin_folder

### Data loading

In [4]:
# CAMELS-spat metadata file
# cs_meta_path is rebound here: from the raw config value to a Path below data_path
# NOTE(review): no dtype is enforced for 'Station_id' here, while the unusable-station list
#               is read with Station_id as string - confirm both columns compare as intended
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name)

In [5]:
# Open list of unusable stations; Enforce reading IDs as string to keep leading 0's
# The file is read from the same folder as the metadata file
cs_unusable = pd.read_csv(cs_meta_path / cs_unusable_name, dtype={'Station_id': object}) 

## Processing
### NetCDF requirements:

#### Time variable
Needs to have attributes:
- `standard_name` = `time`
- `units`: CF conventions (e.g. `unit since date`)
- `calendar`: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#calendar
- `bounds` = `time_bnds`; a separate 2-dimensional variable that defines the start and end time points of each measurement. Variable "time_bnds" has the same attributes "units" and "calendar" as variable "time".

#### Data variables
Need to have attributes:
- `units`: (UDUNITS-2) recommended
- `long_name`: description of variable
- `_FillValue`: ?

#### Global attributes
- `title`: data set name
- `institution`: specifies where the original data was produced (USGS, WSC)
- `source`: way data was derived?
- `references`: USGS or WSC
- `history`: audit trail for modifications to original data
- `comment`: _optional_

### Variables and attributes to be included
- var: `time`
- var: `time_bnds`
- var: `q_obs`
- var: `quality`
- att: `country`
- att: `station_id`
- att: `station_name`

In [1]:
# loop over basins, 
# - if basin is not in 'unusable'
# -   load the flow data into pandas
# -   convert to xarray/netcdf
# -   (re)move the .csv file

In [68]:
# Convert the hourly flow .csv of every usable basin to netcdf.
# Relies on prep_country_csv_for_netcdf() and flow_csv_to_netcdf(), which are defined in
# cells further down this notebook and therefore need to be executed first.
for ix,row in cs_meta.iterrows():
    
    # NOTE(review): this guard skips every row except index 0, so only the first basin is
    #               processed - looks like a debugging restriction; confirm before a full run
    if ix != 0:
        continue
    
    # Skip over metadata entries we cannot use
    if row.Station_id in cs_unusable['Station_id'].values:
        continue # to next row
        
    # 1. Get paths, etc
    # Only the site name, raw csv path and netcdf path are used; other return values are discarded
    site, _, _, csv_path, _, nc_path = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path)
    # The hourly file name is derived from the raw file name by swapping the suffix
    csv_hour_path = Path(str(csv_path).replace('_raw.csv','_hourly.csv'))
    print(f'{ix: >3}. Now working on {site}') #: converting \n{csv_hour_path} to \n{nc_path}')
    
    # 2. Load the csv
    csv = prep_country_csv_for_netcdf(csv_hour_path,row.Country)
    
    # 3. Convert to netcdf and save
    # The returned data set is not used further inside the loop
    nc = flow_csv_to_netcdf(csv, nc_path, row.Country, site)
        

## DEV

In [6]:
# test data
file_can = 'C:/Globus endpoint/CAMELS_spat/camels-spat-data/basin_data/CAN_01AD003/observations/CAN_01AD003_flow_observations_hourly.csv'
file_usa = 'C:/Globus endpoint/CAMELS_spat/camels-spat-data/basin_data/USA_01013500/observations/USA_01013500_flow_observations_hourly.csv'

In [75]:
def return_data_quality_flag_meaning(l,country):
    
    '''Translates a list of data quality flags into a list with the matching flag meanings.
    
    Parameters
    ----------
    l : iterable of str
        Data quality flags, written the way they appear in the flow observation files
        (e.g. 'A:e' for USA data, 'Final/Finales:10' for CAN data).
    country : str
        Country the flags originate from: 'USA' or 'CAN' (case-insensitive).
    
    Returns
    -------
    list of str
        One meaning per flag, in the same order as the input.
    
    Raises
    ------
    ValueError
        If no flag table exists for `country`.
    KeyError
        If a flag is not part of the flag table of `country`.
    '''
    
    # Flag tables per country (lower-case country code)
    standards_by_country = {
        'usa': {'nan'    : 'Unknown',
                'A:[0]'  : 'Undefined',
                'A:<'    : 'Approved, but reported value known to be inaccurate (real value is lower)',
                'A:>'    : 'Approved, but reported value known to be inaccurate (real value is higher)',
                'A:[4]'  : 'Approved, but with Incomplete or Partial Aggregated Record',
                'P:e'    : 'Provisional AND estimated',
                'P'      : 'Provisional, not approved',
                'A:R'    : 'Approved, but revised',
                'A:e'    : 'Approved AND estimated, with unknown data grade code',
                'A'      : 'Approved, with unknown data grade code',
                'A:[93]' : 'Approved, with IV verification DV <= 10 percent diff',
                'A:[92]' : 'Approved, with IV verification DV <= 5 percent diff',
                'A:[91]' : 'Approved, with IV verification DV <= 1 percent diff',
                'A:[90]' : 'Approved, with IV verification DV <= 0.01 orig DV = 0',
               },
        'can': {'Provisional/Provisoire:0'     : 'Provisional, flag unknown (not described in WSC docs)',
                'Provisional/Provisoire:40'    : 'Provisional, dry (water level below sensor)',
                'Provisional/Provisoire:10'    : 'Provisional, ice-affected',
                'Provisional/Provisoire:20'    : 'Provisional, estimated',
                'Provisional/Provisoire:30'    : 'Provisional, partial day (relevant only for daily means)',
                # A provisional record without qualifier is still provisional, not approved
                'Provisional/Provisoire:nan'   : 'Provisional, no qualifier specified',
                'Provisional/Provisoire:-1'    : 'Provisional, no special conditions',
                'Provisional/Provisoire:50'    : 'Provisional, revised',
                'Final/Finales:0'              : 'Approved, flag unknown (not described in WSC docs)',
                'Final/Finales:40'             : 'Approved, dry (water level below sensor)',
                'Final/Finales:10'             : 'Approved, ice-affected',
                'Final/Finales:50'             : 'Approved, but revised',
                'Final/Finales:20'             : 'Approved, but estimated',
                'Final/Finales:30'             : 'Approved, partial day (relevant only for daily means)',
                'Final/Finales:nan'            : 'Approved, no qualifier specified',
                'Final/Finales:-1'             : 'Approved, no special conditions',
               },
    }
    
    # Select the correct dictionary; fail with a clear message for countries without a flag table
    country_key = country.lower()
    if country_key not in standards_by_country:
        raise ValueError(f"No data quality flag meanings defined for country '{country}'. "
                         f"Supported: {sorted(standards_by_country)}")
    standards = standards_by_country[country_key]
    
    # Map flags onto meanings
    meanings = [standards[item] for item in l]
    
    return meanings

In [34]:
def prep_country_csv_for_netcdf(csv_path,country):
    
    '''Loads a .csv with observed flow data and processes according to the country the data originates from.
    
    Parameters
    ----------
    csv_path : str or path-like
        Location of the hourly flow observation .csv. The first column is used as
        (datetime) index; a 'based_on_obs' column must be present.
    country : str
        Country the data originates from: 'USA' or 'CAN' (case-insensitive).
    
    Returns
    -------
    pandas.DataFrame
        Data indexed by 'time', with the flow column renamed to 'q_obs' (in m^3 s^-1),
        'minimum_data_quality' renamed to 'q_obs_data_quality' and 'based_on_obs' removed.
    
    Raises
    ------
    ValueError
        If `country` is not supported, or if any non-missing flow value is not
        flagged as derived from observations.
    '''
    
    # Standardized names and constants
    data_name = 'q_obs'
    flag_name = 'q_obs_data_quality'
    data_conversion = 0.0283168466 # m^3 ft^-3
    
    # Name of the flow column in the source files, per country
    source_columns = {'usa': 'obs_00060',
                      'can': 'Value/Valeur'}
    country_key = country.lower()
    if country_key not in source_columns:
        raise ValueError(f"Unsupported country '{country}'. Supported: {sorted(source_columns)}")
    
    # Load the data
    df = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    df.index.name = 'time'
    
    # Find and rename the flow column and all auxiliary variables
    df = df.rename(columns={source_columns[country_key]: data_name,
                            'minimum_data_quality': flag_name})
    
    # Country-specific processing
    if country_key == 'usa':
        # Unit conversion
        # We know all the USGS data is in cubic feet per second, because we checked this in 1b_usa_flow_obs_to_utc.ipynb
        print('Warning: converting data from units feet^3 s^-1 to m^3 s^-1')
        df[data_name] = df[data_name] * data_conversion
    # CAN data is already in m3/s. 
    # See (page 3): https://collaboration.cmc.ec.gc.ca/cmc/hydrometrics/www/Document/WebService_Guidelines.pdf
    
    # Check that all data values are derived from observations, and proceed if so.
    # An explicit raise is used so the check also runs when Python is started with -O.
    derived_from_obs = df.loc[df[data_name].notna(), 'based_on_obs'] == 1
    if not derived_from_obs.all():
        raise ValueError(f'Not all data values in {csv_path} are derived from observations. Aborting.')
    df = df.drop(columns=['based_on_obs'])
    
    return df

In [86]:
def flow_csv_to_netcdf(csv, nc_path, country, station):
    
    '''Converts a standardized csv file with flow observations to xarray data set and saves as netcdf.
    
    Parameters
    ----------
    csv : pandas.DataFrame
        Flow observations with an index named 'time' and at least the columns
        'q_obs' and 'q_obs_data_quality'.
    nc_path : str or path-like
        Location the netcdf file is written to.
    country : str
        Country the data originates from: 'USA' or 'CAN' (case-insensitive). Selects the
        institution, reference and data quality flag meanings.
    station : str
        Station identifier, stored as global attribute 'station'.
    
    Returns
    -------
    xarray.Dataset
        The data set as it was written to `nc_path`.
    
    Raises
    ------
    ValueError
        If no institution/reference metadata is defined for `country`.
    '''
    
    # 1. Define standard values
    # -------------------------
    
    # Auxiliary
    country_code = country.upper()
    global_att_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    # Global attributes
    global_att_ttl = 'CAMELS-spat streamflow data'
    global_att_con = 'CF-1.10'
    global_att_src = 'Streamflow derived from observed water levels'
    global_att_ins = {'USA': 'United States Geological Survey',
                      'CAN': 'Water Survey of Canada'}
    global_att_ref = {'USA': ('U.S. Geological Survey, 2016, National Water Information System data available ' +
                              'on the World Wide Web (USGS Water Data for the Nation), accessed 2023-03-23, at ' +
                              'URL [http://waterdata.usgs.gov/nwis/]'),
                      'CAN': ('Original data extracted from the Environment and Climate Change Canada Real-time ' +
                              'Hydrometric Data web site (https://wateroffice.ec.gc.ca/mainmenu/real_time_data_index_e.html) ' +
                              'on 2023-04-05')}
    global_att_his = (f'{global_att_now} | File prepared using CAMELS-spat scripts. See: ' +
                       'https://github.com/CH-Earth/camels-spat')
    
    # Fail early and clearly for countries we have no metadata for
    if country_code not in global_att_ins:
        raise ValueError(f"Unsupported country '{country}'. Supported: {sorted(global_att_ins)}")
    
    # Data variables
    q_obs_unit = 'm3 s-1'
    q_obs_long = 'observed streamflow values'
    
    # Time settings
    time_unit = 'minutes since 1950-01-01 00:00:00'
    time_cal = 'proleptic_gregorian'
    half_step = pd.Timedelta('30min') # bounds are placed half an hour either side of each time stamp
    
    # 2. Create a basic data set to build from
    ds = csv.to_xarray()
    
    # 3. Global attributes
    ds.attrs['title'] = global_att_ttl
    ds.attrs['Conventions'] = global_att_con # CF defines this attribute with a capital C
    ds.attrs['source'] = global_att_src
    ds.attrs['country'] = country_code
    ds.attrs['station'] = station
    ds.attrs['institution'] = global_att_ins[country_code]
    ds.attrs['references'] = global_att_ref[country_code]
    ds.attrs['history'] = global_att_his

    # 4a. Time attributes (coordinate already exists)
    # NOTE: attributes 'units' and 'calendar' are automatically specified when writing to netcdf
    #       This can be checked by saving to netcdf, and then loading as follows: xr.open_dataset(nc_path, decode_times=False)
    ds.time.attrs['standard_name'] = 'time'
    ds.time.attrs['bounds'] = 'time_bnds'
    ds.time.encoding['units'] = time_unit
    ds.time.encoding['calendar'] = time_cal
        
    # 4b. Time bounds variable
    # NOTE(review): CF places the vertex dimension last, i.e. (time, nbnds). The (nbnds, time)
    #               layout is kept so existing readers of these files are not affected - confirm
    ds = ds.assign_coords(nbnds=[1,2])
    ds = ds.assign(time_bnds=(['nbnds','time'],
                              [csv.index - half_step, csv.index + half_step]))
    # Free-text description belongs in 'long_name'; 'standard_name' is reserved for CF table entries
    ds.nbnds.attrs['long_name'] = 'bounds for timestep intervals'
    ds.time_bnds.attrs['long_name'] = 'start and end points of each time step'
    
    # 5. Observed streamflow
    ds.q_obs.attrs['units'] = q_obs_unit
    ds.q_obs.attrs['long_name'] = q_obs_long
    ds.q_obs.attrs['cell_methods'] = 'time: mean' # indicating that values are average values over the timestep
    ds.q_obs.attrs['ancillary_variables'] = 'q_obs_data_quality'
    ## TO DO: add other variables to ancillary_variables list
    
    # 6. Data quality flags
    # Flag meanings are country-specific, so look them up for the country of this station
    flags = sorted(str(s) for s in csv['q_obs_data_quality'].unique())
    meanings = return_data_quality_flag_meaning(flags, country)
    ds.q_obs_data_quality.attrs['standard_name'] = 'quality_flag'
    ds.q_obs_data_quality.attrs['long_name'] = 'lowest data quality flag listed in the values used to generate an average flow value for each timestep'
    ds.q_obs_data_quality.attrs['flag_values'] = ' '.join([f"'{flag}'" for flag in flags])
    ds.q_obs_data_quality.attrs['flag_meanings'] = ' '.join([f"'{meaning}'" for meaning in meanings])
    
    # 7. Other variables
    
    # Save to file
    ds = ds.drop_indexes(['time','nbnds'])
    ds.to_netcdf(nc_path)
    
    return ds

In [10]:
# Get site name, raw csv path and netcdf path for one hard-coded metadata row
# NOTE(review): row 1697 is presumably the USA test station used below - confirm
site, _, _, csv_path, _, nc_path = cs.prepare_flow_download_outputs(cs_meta, 1697, basins_path)

In [36]:
# Load and standardize the USA test file
csv = prep_country_csv_for_netcdf(file_usa,'USA')



In [88]:
# Convert the prepared USA test data to netcdf at nc_path; the bare `ds` displays the result
ds = flow_csv_to_netcdf(csv, nc_path, 'USA', site)
ds

In [17]:
# Close the data set that was just written
ds.close()

In [18]:
# Open the written netcdf file again (presumably to check that it reads back correctly)
test = xr.open_dataset(nc_path)

In [20]:
# Close the re-opened file
test.close()

In [6]:
# Load the raw Canadian hourly test file (file_can is set in the test data cell)
csv_can = pd.read_csv(file_can, index_col=0, parse_dates=True)

In [7]:
# Load the raw USA hourly test file (file_usa is set in the test data cell)
csv_usa = pd.read_csv(file_usa, index_col=0, parse_dates=True)

In [8]:
# Show the column names of both raw files to see which columns need country-specific handling
print(csv_can.columns)
print(csv_usa.columns)

Index(['Value/Valeur', 'based_on_obs', 'is_ice_affected',
       'is_malfunction_affected', 'is_backwater_affected',
       'is_below_sensor_level', 'minimum_data_quality'],
      dtype='object')
Index(['obs_00060', 'based_on_obs', 'is_ice_affected', 'minimum_data_quality'], dtype='object')
