# Convert daily .csv files to .nc

NetCDF instructions:
- CF Conventions: http://cfconventions.org/
- CF Standard names: http://cfconventions.org/Data/cf-standard-names/current/build/cf-standard-name-table.html
- Oak Ridge National Lab guide: https://daac.ornl.gov/submit/netcdfrequirements/


In [1]:
import os
import sys
import shutil
import pandas as pd
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Specify where the config file can be found.
# The path is relative, so it is resolved against the current working directory.
config_file = '../0_config/config.txt'

In [3]:
# Pull every setting this notebook needs out of the config file
data_path = cs.read_from_config(config_file, 'data_path')

# CAMELS-spat metadata: folder, metadata file name and unusable-stations file name
# NOTE(review): the metadata folder is read from the same 'cs_basin_path' key as the
# basin folder below - confirm this is intentional
cs_meta_path, cs_meta_name, cs_unusable_name = (
    cs.read_from_config(config_file, key)
    for key in ('cs_basin_path', 'cs_meta_name', 'cs_unusable_name')
)

# Basin folder, resolved against the data root
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path).joinpath(cs_basin_folder)

### Data loading

In [4]:
# CAMELS-spat metadata file
# Enforce reading station IDs as string to keep leading 0's. The unusable-station list is
# read the same way, and the two ID columns are compared against each other later on.
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name, dtype={'Station_id': object})

In [5]:
# Load the list of unusable stations; IDs are forced to string so leading 0's survive
unusable_file = cs_meta_path / cs_unusable_name
cs_unusable = pd.read_csv(unusable_file, dtype={'Station_id': object})

## Processing
### NetCDF requirements:

#### Time variable
Needs to have attributes:
- `standard_name` = `time`
- `units`: CF conventions (e.g. `unit since date`)
- `calendar`: http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/cf-conventions.html#calendar
- `bounds` = `time_bnds`; a separate 2-dimensional variable that defines the start and end time points of each measurement. Variable "time_bnds" has the same attributes "units" and "calendar" as variable "time".

#### Data variables
Need to have attributes:
- `units`: (UDUNITS-2) recommended
- `long_name`: description of variable
- `_FillValue`: ?

#### Global attributes
- `title`: data set name
- `institution`: specifies where the original data was produced (USGS, WSC)
- `source`: way data was derived?
- `references`: USGS or WSC
- `history`: audit trail for modifications to original data
- `comment`: _optional_

### Variables and attributes to be included
- var: `time`
- var: `time_bnds`
- var: `q_obs`
- var: `quality`
- att: `country`
- att: `station_id`
- att: `station_name`

In [6]:
# Choose the fate of each raw csv once it has been converted
remove_raw = False  # True: delete the raw file
move_raw = True  # True: relocate the raw file to the location given by move_raw_here
# At most one of the two may be active, which keeps the later (re)move logic simple
assert (not remove_raw) or (not move_raw), 'remove_raw and move_raw cannot both be True'

In [7]:
if move_raw:
    # Root of the destination for raw files; the sub-folders mimic the existing data structure
    move_raw_here = 'D:/CAMELS_spat'
    move_raw_path = Path(move_raw_here, cs_basin_folder, 'basin_data')

In [8]:
# Keep only the unusable-station entries whose 'Missing' column equals 'dv'
# NOTE(review): presumably these are the stations without daily values (DV) - confirm
subset_column = 'Missing'
subset_to = 'dv'
is_selected = cs_unusable[subset_column] == subset_to
cs_unusable_masked = cs_unusable.loc[is_selected]

In [10]:
# Convert every usable station's daily csv file to netcdf, then remove or move the csv.
# The unusable IDs are collected in a set once, so the per-row membership test below is
# a constant-time lookup instead of a scan of the whole column.
unusable_ids = set(cs_unusable_masked['Station_id'])

for ix, row in cs_meta.iterrows():

    # Skip over metadata entries we cannot use
    if row.Station_id in unusable_ids:
        continue # to next row

    # 1. Get paths, etc
    site, _, _, csv_path, _, nc_path = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='daily')
    print(f'{ix: >3}. Now working on {site}')

    # Resume after interrupts
    if not os.path.isfile(csv_path): # If csv file doesn't exist it must have been processed already
        continue                     #   Note: assumes remove_raw or move_raw are True

    # 2. Load the csv
    csv = cs.prep_daily_country_csv_for_netcdf(csv_path, row.Country, row.dv_flow_obs_timezone)

    # 3. Add the time bounds for daily averages. Steps:
    # - Get the dates as string for later use
    # - Convert timezone string ('EST') to 'UTC-0xxx'
    # - Convert 'UTC-0xxx' to '+0x:00'
    # - Convert 'yy-mm-dd 00:00' to a time-aware datetime. This is the left bound of the interval
    # - Add 24 hours to the left bound to find the right bound
    # The lambda argument is called `rec` so it does not shadow the metadata `row` of the outer loop.
    csv['datetime']    = csv.index.strftime('%Y-%m-%d 00:00')
    csv['tz_utc_str']  = csv.apply(lambda rec: cs.tz_abbreviation_to_utc(rec['q_obs_tz_cd']), axis=1)
    csv['tz_td_str']   = csv.apply(lambda rec: cs.relative_utc_to_timedelta(rec['tz_utc_str']), axis=1)
    csv['time_bnds_l'] = csv.apply(lambda rec: cs.datetime_str_to_timeaware_datetime(rec['datetime'], rec['tz_td_str'],
                                                                                     localize_to_UTC=False), axis=1)
    csv['time_bnds_r'] = csv.apply(lambda rec: rec['time_bnds_l'] + pd.Timedelta(hours=24), axis=1)

    # 4. Convert to netcdf and save
    ds = cs.daily_flow_csv_to_netcdf(csv, nc_path, row.Country, site)
    ds.close()

    # 5. (Re)move the csv file. The two options are mutually exclusive, so a file that was
    #    removed is never moved afterwards.
    if remove_raw:
        os.remove(csv_path)
    elif move_raw:
        move_path = move_raw_path / f'{row.Country}_{site}' / 'observations'
        move_path.mkdir(parents=True, exist_ok=True)
        move_file = os.path.basename(csv_path)
        shutil.move(csv_path, move_path / move_file)

  0. Now working on 01AD002
  1. Now working on 01AD003
  2. Now working on 01AE001
  3. Now working on 01AF007
  4. Now working on 01AF009
  5. Now working on 01AJ003
  6. Now working on 01AJ004
  7. Now working on 01AJ010
  8. Now working on 01AK001
  9. Now working on 01AK006
 10. Now working on 01AK007
 11. Now working on 01AL002
 12. Now working on 01AL004
 13. Now working on 01AM001
 14. Now working on 01AN002
 15. Now working on 01AP002
 16. Now working on 01AP004
 17. Now working on 01AP006
 18. Now working on 01AQ001
 19. Now working on 01BC001
 20. Now working on 01BD008
 21. Now working on 01BE001
 22. Now working on 01BG005
 23. Now working on 01BG008
 24. Now working on 01BG009
 25. Now working on 01BH005
 26. Now working on 01BH010
 27. Now working on 01BJ003
 28. Now working on 01BJ007
 29. Now working on 01BJ010
 30. Now working on 01BJ012
 31. Now working on 01BL002
 32. Now working on 01BL003
 33. Now working on 01BO001
 34. Now working on 01BP001
 35. Now working on 

437. Now working on 05HD036
438. Now working on 05HE001
439. Now working on 05HG021
440. Now working on 05HH002
441. Now working on 05HH003
442. Now working on 05JA003
443. Now working on 05JB004
444. Now working on 05JB007
445. Now working on 05JC004
446. Now working on 05JC007
447. Now working on 05JF011
448. Now working on 05JF014
449. Now working on 05JG001
450. Now working on 05JG013
451. Now working on 05JH005
452. Now working on 05JJ009
453. Now working on 05JK008
454. Now working on 05JM010
455. Now working on 05KB003
456. Now working on 05KB006
457. Now working on 05KB011
458. Now working on 05KC001
459. Now working on 05KE005
460. Now working on 05KE007
461. Now working on 05KE010
462. Now working on 05KF001
463. Now working on 05KG007
464. Now working on 05KH007
465. Now working on 05KJ014
466. Now working on 05LA003
467. Now working on 05LB004
468. Now working on 05LB006
469. Now working on 05LB008
470. Now working on 05LB010
471. Now working on 05LC001
472. Now working on 

882. Now working on 08NE039
883. Now working on 08NE074
884. Now working on 08NE077
885. Now working on 08NE087
886. Now working on 08NE110
887. Now working on 08NE114
888. Now working on 08NF001
889. Now working on 08NG065
890. Now working on 08NG076
891. Now working on 08NG077
892. Now working on 08NH005
893. Now working on 08NH006
894. Now working on 08NH007
895. Now working on 08NH016
896. Now working on 08NH032
897. Now working on 08NH084
898. Now working on 08NH115
899. Now working on 08NH119
900. Now working on 08NH120
901. Now working on 08NH130
902. Now working on 08NH132
903. Now working on 08NJ013
904. Now working on 08NJ026
905. Now working on 08NJ061
906. Now working on 08NJ130
907. Now working on 08NJ160
908. Now working on 08NK026
909. Now working on 08NL004
910. Now working on 08NL007
911. Now working on 08NL024
912. Now working on 08NL038
913. Now working on 08NL050
914. Now working on 08NL069
915. Now working on 08NL070
916. Now working on 08NL071
917. Now working on 

1102. Now working on 01613050
1103. Now working on 01620500
1104. Now working on 01632000
1105. Now working on 01632900
1106. Now working on 01634500
1107. Now working on 01638480
1108. Now working on 01639500
1109. Now working on 01644000
1110. Now working on 01658500
1111. Now working on 01664000
1112. Now working on 01666500
1113. Now working on 01667500
1114. Now working on 01669000
1115. Now working on 01669520
1116. Now working on 02011400
1117. Now working on 02011460
1118. Now working on 02013000
1119. Now working on 02014000
1120. Now working on 02015700
1121. Now working on 02016000
1122. Now working on 02017500
1123. Now working on 02018000
1124. Now working on 02027000
1125. Now working on 02027500
1126. Now working on 02028500
1127. Now working on 02038850
1128. Now working on 02046000
1129. Now working on 02051000
1130. Now working on 02051500
1131. Now working on 02053200
1132. Now working on 02053800
1133. Now working on 02055100
1134. Now working on 02056900
1135. Now 

1194. Now working on 02369800
1195. Now working on 02371500
1196. Now working on 02372250
1197. Now working on 02374500
1198. Now working on 02381600
1199. Now working on 02384540
1200. Now working on 02395120
1201. Now working on 02408540
1202. Now working on 02415000
1203. Now working on 02422500
1204. Now working on 02427250
1205. Now working on 02430085
1206. Now working on 02430615
1207. Now working on 02450250
1208. Now working on 02464000
1209. Now working on 02464146
1210. Now working on 02464360
1211. Now working on 02465493
1212. Now working on 02469800
1213. Now working on 02472000
1214. Now working on 02472500
1215. Now working on 02479155
1216. Now working on 02479300
1217. Now working on 02479560
1218. Now working on 02481000
1219. Now working on 02481510
1220. Now working on 03010655
1221. Now working on 03011800
1222. Now working on 03015500
1223. Now working on 03021350
1224. Now working on 03026500
1225. Now working on 03028000
1226. Now working on 03049000
1227. Now 

1286. Now working on 04043050
1287. Now working on 04045500
1288. Now working on 04056500
1289. Now working on 04057510
1290. Now working on 04057800
1291. Now working on 04059500
1292. Now working on 04063700
1293. Now working on 04074950
1294. Now working on 04105700
1295. Now working on 04115265
1296. Now working on 04122200
1297. Now working on 04122500
1298. Now working on 04124000
1299. Now working on 04127918
1300. Now working on 04127997
1301. Now working on 04161580
1302. Now working on 04185000
1303. Now working on 04196800
1304. Now working on 04197100
1305. Now working on 04197170
1306. Now working on 04213000
1307. Now working on 04213075
1308. Now working on 04216418
1309. Now working on 04221000
1310. Now working on 04224775
1311. Now working on 04233000
1312. Now working on 04256000
1313. Now working on 04296000
1314. Now working on 05056000
1315. Now working on 05057000
1316. Now working on 05057200
1317. Now working on 05062500
1318. Now working on 05087500
1319. Now 

1378. Now working on 06408700
1379. Now working on 06409000
1380. Now working on 06431500
1381. Now working on 06440200
1382. Now working on 06441500
1383. Now working on 06447000
1384. Now working on 06447500
1385. Now working on 06450500
1386. Now working on 06452000
1387. Now working on 06453600
1388. Now working on 06464500
1389. Now working on 06468170
1390. Now working on 06468250
1391. Now working on 06470800
1392. Now working on 06477500
1393. Now working on 06479215
1394. Now working on 06479438
1395. Now working on 06601000
1396. Now working on 06614800
1397. Now working on 06622700
1398. Now working on 06623800
1399. Now working on 06632400
1400. Now working on 06746095
1401. Now working on 06784000
1402. Now working on 06803510
1403. Now working on 06803530
1404. Now working on 06814000
1405. Now working on 06847900
1406. Now working on 06853800
1407. Now working on 06876700
1408. Now working on 06878000
1409. Now working on 06879650
1410. Now working on 06885500
1411. Now 

1470. Now working on 08025500
1471. Now working on 08029500
1472. Now working on 08050800
1473. Now working on 08066200
1474. Now working on 08066300
1475. Now working on 08070000
1476. Now working on 08070200
1477. Now working on 08079600
1478. Now working on 08082700
1479. Now working on 08086212
1480. Now working on 08086290
1481. Now working on 08101000
1482. Now working on 08103900
1483. Now working on 08104900
1484. Now working on 08109700
1485. Now working on 08150800
1486. Now working on 08155200
1487. Now working on 08158700
1488. Now working on 08158810
1489. Now working on 08164000
1490. Now working on 08164300
1491. Now working on 08164600
1492. Now working on 08165300
1493. Now working on 08171300
1494. Now working on 08175000
1495. Now working on 08176900
1496. Now working on 08178880
1497. Now working on 08189500
1498. Now working on 08190000
1499. Now working on 08190500
1500. Now working on 08194200
1501. Now working on 08195000
1502. Now working on 08196000
1503. Now 

1562. Now working on 10259200
1563. Now working on 10263500
1564. Now working on 10310500
1565. Now working on 10316500
1566. Now working on 10329500
1567. Now working on 10336645
1568. Now working on 10336660
1569. Now working on 10336740
1570. Now working on 10343500
1571. Now working on 10348850
1572. Now working on 10396000
1573. Now working on 11098000
1574. Now working on 11124500
1575. Now working on 11141280
1576. Now working on 11143000
1577. Now working on 11148900
1578. Now working on 11151300
1579. Now working on 11162500
1580. Now working on 11176400
1581. Now working on 11180500
1582. Now working on 11180960
1583. Now working on 11224500
1584. Now working on 11230500
1585. Now working on 11237500
1586. Now working on 11253310
1587. Now working on 11264500
1588. Now working on 11266500
1589. Now working on 11274500
1590. Now working on 11274630
1591. Now working on 11284400
1592. Now working on 11299600
1593. Now working on 11381500
1594. Now working on 11383500
1595. Now 

1654. Now working on 13018300
1655. Now working on 13023000
1656. Now working on 13083000
1657. Now working on 13161500
1658. Now working on 13235000
1659. Now working on 13240000
1660. Now working on 13310700
1661. Now working on 13313000
1662. Now working on 13331500
1663. Now working on 13337000
1664. Now working on 13338500
1665. Now working on 13340000
1666. Now working on 13340600
1667. Now working on 14020000
1668. Now working on 14092750
1669. Now working on 14096850
1670. Now working on 14137000
1671. Now working on 14138800
1672. Now working on 14138870
1673. Now working on 14138900
1674. Now working on 14139800
1675. Now working on 14141500
1676. Now working on 14154500
1677. Now working on 14158500
1678. Now working on 14158790
1679. Now working on 14166500
1680. Now working on 14182500
1681. Now working on 14185000
1682. Now working on 14185900
1683. Now working on 14187000
1684. Now working on 14216500
1685. Now working on 14222500
1686. Now working on 14236200
1687. Now 

## Run only once - find out which streamflow codes we have in the daily data files

### USA

In [87]:
def remove_strings_starting_with_number(arr):
    """Return the string entries of `arr` that do not start with a digit.

    Non-string entries (e.g. NaN or None) are dropped. Strings with a leading
    sign, such as '-0.03', are kept because their first character is not a digit.

    Parameters
    ----------
    arr : iterable
        Values to filter.

    Returns
    -------
    list
        The retained strings, in their original order.
    """
    # Imported locally so the function also works when `re` has not been imported elsewhere
    import re

    starts_with_digit = re.compile(r'^\d')
    return [string for string in arr
            if isinstance(string, str) and not starts_with_digit.match(string)]

In [88]:
# Collect, for every USA station, the entries of the daily flow column that are strings
# not starting with a digit
usa_streamflow_codes = []
for ix, row in cs_meta.iterrows():
    if row.Country != 'USA':
        continue
    site, _, _, csv_path, _, nc_path = cs.prepare_flow_download_outputs(cs_meta, ix, basins_path, time='daily')
    # Everything is read as 'object' so the codes are kept as they appear in the file
    station_csv = pd.read_csv(csv_path, index_col=0, parse_dates=True, dtype='object')
    flow_entries = station_csv['obs_00060_00003'].unique()
    usa_streamflow_codes.append(remove_strings_starting_with_number(flow_entries))

In [91]:
# Merge the per-station code lists and show every distinct code once.
# NOTE(review): `np` (numpy) is not imported in any cell visible in this notebook -
# confirm it is imported before this cell runs.
np.unique(np.concatenate(usa_streamflow_codes))

array(['***', '-0.00', '-0.03', '-0.42', 'Bkw', 'Dis', 'Eqp', 'Ice',
       'Rat'], dtype='<U32')

 - `***`: value unavailable - no need to do anything, will be nan later
 - `-x.`: negative values - no need to do anything, will be nan later
 - `Bkw`: backwater-affected
 - `Dis`: station discontinued - no need to do anything
 - `Eqp`: equipment malfunction
 - `Ice`: ice-affected
 - `Rat`: rating curve being developed - no need to do anything

### CAN

We have already checked the Canadian streamflow codes in `2c_can_daily_flow_obs_from_hydat.ipynb`. For posterity:

In [166]:
# Show the contents of the 'DATA_SYMBOLS' table of `db`.
# NOTE(review): `db` is not defined in this notebook; presumably it is the database handle
# opened in `2c_can_daily_flow_obs_from_hydat.ipynb` - confirm before re-running this cell.
cs.find_table_contents(db,'DATA_SYMBOLS',to_screen=True);

('A', 'Partial Day', 'Journée incomplète')
('B', 'Ice Conditions', 'Conditions à glace')
('D', 'Dry', 'Sec')
('E', 'Estimated', 'Estimé')
('S', 'Sample(s) collected this day', 'échantillons prélevés ce jour-là')
