# Convert .csv to .nc

NetCDF instructions:
- CF Conventions: http://cfconventions.org/
- CF Standard names: http://cfconventions.org/Data/cf-standard-names/current/build/cf-standard-name-table.html
- Oak Ridge National Lab guide: https://daac.ornl.gov/submit/netcdfrequirements/


In [18]:
import os
import sys
import shutil
import xarray as xr
import pandas as pd
from pathlib import Path
from datetime import datetime
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found.
# The path is relative, so it is resolved against the notebook's current working directory.
config_file = '../0_config/config.txt'

In [3]:
# Pull every setting this notebook needs out of the config file.
# Base folder that the relative folders below are joined onto
data_path = cs.read_from_config(config_file, 'data_path')

# CAMELS-spat metadata: containing folder, metadata file name, unusable-stations file name.
# NOTE(review): the metadata folder is looked up under the 'cs_basin_path' key, the same key
# that is used for the basin folder below - confirm this is intentional.
cs_meta_path, cs_meta_name, cs_unusable_name = (
    cs.read_from_config(config_file, key)
    for key in ('cs_basin_path', 'cs_meta_name', 'cs_unusable_name')
)

# Basin folder, joined onto the data root
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path).joinpath(cs_basin_folder)

### Data loading

In [4]:
# CAMELS-spat metadata file:
# turn the configured metadata folder into a full path under the data root, then read the table
cs_meta_path = Path(data_path).joinpath(cs_meta_path)
cs_meta = pd.read_csv(cs_meta_path.joinpath(cs_meta_name))

In [5]:
# Open list of unusable stations.
# Station IDs are read with dtype `object` (i.e. kept as text) so leading 0's are not lost.
cs_unusable = pd.read_csv(
    cs_meta_path / cs_unusable_name,
    dtype=dict(Station_id=object),
)

## Processing
### NetCDF requirements:

#### Time variable
Needs to have attributes:
- `standard_name` = `time`
- `units`: CF conventions, in the form `<unit> since <reference date>` (e.g. `hours since 1950-01-01 00:00:00`)
- `calendar`: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#calendar
- `bounds` = `time_bnds`; a separate 2-dimensional variable that defines the start and end time points of each measurement. Variable "time_bnds" has the same attributes "units" and "calendar" as variable "time".

#### Data variables
Need to have attributes:
- `units`: UDUNITS-2 compatible units are recommended
- `long_name`: description of variable
- `_FillValue`: value used to mark missing data (specific value still to be decided)

#### Global attributes
- `title`: data set name
- `institution`: specifies where the original data was produced (USGS, WSC)
- `source`: way data was derived?
- `references`: USGS or WSC
- `history`: audit trail for modifications to original data
- `comment`: _optional_

### Variables and attributes to be included
- var: `time`
- var: `time_bnds`
- var: `q_obs`
- var: `quality`
- att: `country`
- att: `station_id`
- att: `station_name`

In [1]:
# Plan: loop over basins
# - if the basin is not in 'unusable':
# -   load the flow data into pandas
# -   convert to xarray/netcdf
# -   (re)move the .csv file

In [27]:
# Specify global attributes (title, institution, source, references, history, comment).
# Long strings are built by implicit concatenation of adjacent literals inside parentheses.
# A backslash continuation *inside* the quotes keeps the next line's leading indentation as
# part of the string, which would put long runs of spaces into the attribute text.
global_att_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # naive local time, used in `history`
global_att_ttl = 'CAMELS-spat streamflow data'
# Two-element lists: first entry is the USGS text, second entry is the WSC text
global_att_ins = ['United States Geological Survey',
                  'Water Survey of Canada']
global_att_src = 'Streamflow derived from observed water levels'
global_att_ref = [
    ('U.S. Geological Survey, 2016, National Water Information System data available on the World Wide Web '
     '(USGS Water Data for the Nation), accessed 2023-03-23, at URL [http://waterdata.usgs.gov/nwis/]'),
    ('Original data extracted from the Environment and Climate Change Canada Real-time Hydrometric Data web '
     'site (https://wateroffice.ec.gc.ca/mainmenu/real_time_data_index_e.html) on 2023-04-05'),
]
global_att_his = (f'{global_att_now} File prepared using scripts prepared for CAMELS-spat creation. See: '
                  'https://github.com/CH-Earth/camels-spat')
global_att_com = 'n/a'

In [10]:
# Example Canadian gauge: hourly flow observations for basin CAN_01AD003.
# The location is built from the configured `basins_path` instead of a hard-coded absolute
# path, so the cell is not tied to one machine's folder layout.
# NOTE(review): assumes `basins_path` resolves to the '.../camels-spat-data/basin_data' folder
# that the previous hard-coded path pointed at - confirm against config.txt.
# `.as_posix()` keeps `file` a forward-slash string, as it was before.
file = (basins_path / 'CAN_01AD003' / 'observations'
        / 'CAN_01AD003_flow_observations_hourly.csv').as_posix()
# First column becomes the index and is parsed as dates
csv_can = pd.read_csv(file, index_col=0, parse_dates=True)

In [11]:
# Example US gauge: hourly flow observations for basin USA_01013500.
# The location is built from the configured `basins_path` instead of a hard-coded absolute
# path, so the cell is not tied to one machine's folder layout.
# NOTE(review): assumes `basins_path` resolves to the '.../camels-spat-data/basin_data' folder
# that the previous hard-coded path pointed at - confirm against config.txt.
# `.as_posix()` keeps `file` a forward-slash string, as it was before.
file = (basins_path / 'USA_01013500' / 'observations'
        / 'USA_01013500_flow_observations_hourly.csv').as_posix()
# First column becomes the index and is parsed as dates
csv_usa = pd.read_csv(file, index_col=0, parse_dates=True)

In [12]:
# Show the column sets of the Canadian and US example files, one per line, for comparison
print(csv_can.columns, csv_usa.columns, sep='\n')

Index(['Value/Valeur', 'based_on_obs', 'is_ice_affected',
       'is_malfunction_affected', 'is_backwater_affected',
       'is_below_sensor_level', 'minimum_data_quality'],
      dtype='object')
Index(['obs_00060', 'based_on_obs', 'is_ice_affected', 'minimum_data_quality'], dtype='object')
