In [1]:
import numpy as np
from pathlib import Path, PosixPath
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr

Path to dataset

In [2]:
# Root directory of the local CAMELS download.
# Built with `Path` instead of `PosixPath` so the cell does not fail on
# non-POSIX systems; on Linux/macOS the resulting object is still a PosixPath.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
path = Path('/media/davidhaasnoot/files/large_data_sets/CAMELS')

In [3]:
# Display the dataset root as a quick sanity check.
path

PosixPath('/media/davidhaasnoot/files/large_data_sets/CAMELS')

# load characteristics 

In [4]:
# Collect the attribute tables (*.txt) that sit directly in the dataset root.
# glob() yields entries in arbitrary, filesystem-dependent order, while the
# following cell selects files by position, so sort for a deterministic order.
indicator_files = sorted(path.glob('*.txt'))

In [5]:
# Read the seven semicolon-delimited attribute tables.
# The files are selected by position, so this assumes `indicator_files` is
# ordered as: clim, geol, hydro, name, soil, topo, vege.
# NOTE(review): confirm the ordering of `indicator_files` before relying on it.
df_clim_indicators = pd.read_csv(indicator_files[0], delimiter=";")
df_geol_indicators = pd.read_csv(indicator_files[1], delimiter=";")
df_hydro_indicators = pd.read_csv(indicator_files[2], delimiter=";")
df_catchment_names = pd.read_csv(indicator_files[3], delimiter=";")
df_soil_indicators = pd.read_csv(indicator_files[4], delimiter=";")
df_topo_indicators = pd.read_csv(indicator_files[5], delimiter=";")
df_vege_indicators = pd.read_csv(indicator_files[6], delimiter=";")

# Remove surrounding whitespace from the land-cover labels. The vectorised
# string accessor is used instead of a per-element lambda; it also leaves
# missing values untouched rather than raising on them.
df_vege_indicators['dom_land_cover'] = df_vege_indicators['dom_land_cover'].str.strip()

In [6]:
# Re-index every attribute table by its 'gauge_id' column so that the tables
# can be aligned on the gauge in the next step.
df_char_lst = [
    table.set_index('gauge_id')
    for table in (
        df_geol_indicators,
        df_clim_indicators,
        df_hydro_indicators,
        df_catchment_names,
        df_soil_indicators,
        df_vege_indicators,
        df_topo_indicators,
    )
]

In [7]:
# Join all attribute tables side by side; rows are aligned on the gauge_id index.
characteristics_all = pd.concat(df_char_lst, axis="columns")

## Forcing path

In [8]:
# Available forcing products; the first one is used for the single-basin test.
source_lst = ['daymet', 'maurer', 'nldas']
source = source_lst[0]

# Nested directory names leading to the basin-mean forcing files.
forcing_sub_path_1 = 'basin_timeseries_v1p2_metForcing_obsFlow'
forcing_sub_path_2 = 'basin_dataset_public_v1p2'
forcing_sub_path_3 = 'basin_mean_forcing'

In [9]:
# Directory holding the basin-mean forcing files of the selected source.
forcing_path = path.joinpath(
    forcing_sub_path_1,
    forcing_sub_path_2,
    forcing_sub_path_3,
    source,
)

In [10]:
# Sub-directories of the forcing directory.
# Sorted because glob() order is arbitrary and a later cell takes element [0];
# restricted to directories so a stray file cannot be picked up as one.
sub_dirs_forcing = sorted(entry for entry in forcing_path.glob("*") if entry.is_dir())

In [11]:
# Forcing files (*.txt) in the first sub-directory.
# Sorted because glob() order is arbitrary and later cells select the test
# basins by position ([0] and [1]).
basin_mean_forcing_files = sorted(sub_dirs_forcing[0].glob("*.txt"))

In [12]:
# Pick the first forcing file as the test basin.
basin_mean_forcing_file = basin_mean_forcing_files[0]
# The name of the containing directory is reused to locate related files.
folder_id = basin_mean_forcing_file.parent.name
# The first 8 characters of the file name are used as the basin ID.
basin_id = basin_mean_forcing_file.name[:8]

In [13]:
# Select this basin's row of characteristics as an independent copy.
# The lookup key is int(basin_id), i.e. the 8-character ID from the file name
# converted to an integer (any leading zero is dropped) -- presumably because
# read_csv parsed 'gauge_id' as integers; confirm.
catchment_char = characteristics_all.loc[int(basin_id)].copy()

In [14]:
# Number of catchments in the combined characteristics table.
len(characteristics_all.index)

671

## retrieve alpha

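Retrieve alpha (the `pet_coef` / `PT_COEF` entry in the model parameter files) for each basin: it is not always 1.26, so it is read from the files rather than hard-coded.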

In [15]:
# Directory with the model-output flow time series (and parameter files) of
# the selected source.
model_ouput_path = path.joinpath(
    f'basin_timeseries_v1p2_modelOutput_{source}',
    f'model_output_{source}',
    'model_output',
    'flow_timeseries',
    source,
)

In [16]:
# Display the resolved model-output directory as a sanity check.
model_ouput_path

PosixPath('/media/davidhaasnoot/files/large_data_sets/CAMELS/basin_timeseries_v1p2_modelOutput_daymet/model_output_daymet/model_output/flow_timeseries/daymet')

In [17]:
# All model-parameter files of this basin inside its folder; the wildcard
# covers the part of the file name between the basin ID and the fixed suffix.
parameter_files_path = list(
    model_ouput_path.joinpath(folder_id).glob(f'{basin_id}_*_model_parameters.txt')
)

In [18]:
# Collect the coefficient from every parameter file: take each line that
# begins with `line_start` and parse its last tab-separated field as a float.
line_start = "pet_coef"
lst_alpha = []
for parameter_file in parameter_files_path:
    with open(parameter_file, 'r') as handle:
        lst_alpha.extend(
            float(row.strip().split("\t")[-1])
            for row in handle
            if row.startswith(line_start)
        )

In [19]:
# Average the coefficients found across the parameter files.
# NOTE(review): if no value was collected this evaluates to NaN (with a
# NumPy RuntimeWarning) rather than raising.
alpha = np.mean(lst_alpha)

## add streamflow

In [20]:
# Location of the streamflow files and the file belonging to the selected basin.
streamflow_path = path.joinpath(forcing_sub_path_1, forcing_sub_path_2, 'usgs_streamflow')
streamflow_file_path = streamflow_path.joinpath(folder_id, f'{basin_id}_streamflow_qc.txt')

# Combine all 

In [21]:
from read_camels import from_camels_txt, crop_ds, calc_pet

In [22]:
# Time window passed to from_camels_txt, written as ISO 8601 timestamps with a
# 'Z' (UTC) suffix.
# NOTE(review): how the window is applied is defined in read_camels --
# presumably the series are cropped to it; confirm.
start_date = "1980-01-01T00:00:00Z"
end_date = "2014-12-31T00:00:00Z"

In [23]:
# Output directory for the selected source, relative to the current working
# directory (cwd() is a classmethod, so it does not depend on `path`).
out_path = Path.cwd() / "Output" / source
# parents=True: the intermediate "Output" directory may not exist yet, in which
# case mkdir() without it raises FileNotFoundError.
out_path.mkdir(parents=True, exist_ok=True)

In [24]:
# Build the dataset for the first test basin from its forcing file, basin
# characteristics, alpha and streamflow file.
ds0 = from_camels_txt(
    basin_mean_forcing_file,
    basin_id,
    source,
    start_date,
    end_date,
    out_path,
    alpha,
    catchment_char,
    streamflow_file_path,
)

## test merge

In [25]:
# Source dependent: repeat the per-source setup for the merge test.
source_lst = ['daymet', 'maurer', 'nldas']
source = source_lst[0]
forcing_sub_path_1 = 'basin_timeseries_v1p2_metForcing_obsFlow'
forcing_sub_path_2 = 'basin_dataset_public_v1p2'
forcing_sub_path_3 = 'basin_mean_forcing'
forcing_path = path.joinpath(forcing_sub_path_1, forcing_sub_path_2, forcing_sub_path_3, source)
sub_dirs_forcing = [entry for entry in forcing_path.glob("*")]
model_ouput_path = path.joinpath(
    f'basin_timeseries_v1p2_modelOutput_{source}',
    f'model_output_{source}',
    'model_output',
    'flow_timeseries',
    source,
)
streamflow_path = path.joinpath(forcing_sub_path_1, forcing_sub_path_2, 'usgs_streamflow')

In [26]:
# Basin dependent: same steps as for the first basin, now for the second
# forcing file in the list.
basin_mean_forcing_file = basin_mean_forcing_files[1]
folder_id = basin_mean_forcing_file.parent.name
basin_id = basin_mean_forcing_file.name[:8]
catchment_char = characteristics_all.loc[int(basin_id)].copy()
parameter_files_path = list(
    model_ouput_path.joinpath(folder_id).glob(f'{basin_id}_*_model_parameters.txt')
)

# Take the last tab-separated field of every line starting with `line_start`
# and average the values over all parameter files.
line_start = "pet_coef"
lst_alpha = []
for parameter_file in parameter_files_path:
    with open(parameter_file, 'r') as handle:
        lst_alpha.extend(
            float(row.strip().split("\t")[-1])
            for row in handle
            if row.startswith(line_start)
        )
alpha = np.mean(lst_alpha)

streamflow_file_path = streamflow_path.joinpath(folder_id, f'{basin_id}_streamflow_qc.txt')

In [27]:
# Build the dataset for the second test basin.
ds1 = from_camels_txt(
    basin_mean_forcing_file,
    basin_id,
    source,
    start_date,
    end_date,
    out_path,
    alpha,
    catchment_char,
    streamflow_file_path,
)

In [28]:
# Check that the two single-basin datasets can be combined with xr.merge;
# the result is only displayed, not stored.
xr.merge([ds0,ds1])

# loop

Not found: basin IDs 1150900, 6775500 and 6846500 are missing from the characteristics table, so `catchment_char = characteristics_all.loc[int(basin_id)].copy()` fails for them.

Error in basins 02108000 and 05120500: the daymet forcing files have a wrong header.

In [29]:
# Basin IDs (as integers) that the conversion loop skips.
# The last three are the IDs noted above as missing from characteristics_all;
# the reason for skipping the first three is not recorded in this notebook --
# TODO document.
skip_ids = [2081113, 3448942, 9535100, 1150900, 6775500, 6846500]

In [None]:
# Convert every basin of the selected forcing products with from_camels_txt.
# NOTE(review): `source_lst[1:]` skips 'daymet' -- presumably because the
# daymet output was produced in an earlier run; iterate over `source_lst` to
# process all three sources.
source_lst = ['daymet','maurer','nldas']

# Source-independent path components, hoisted out of the loop.
forcing_sub_path_1 = 'basin_timeseries_v1p2_metForcing_obsFlow'
forcing_sub_path_2 = 'basin_dataset_public_v1p2'
forcing_sub_path_3 = 'basin_mean_forcing'
streamflow_path = path / forcing_sub_path_1 / forcing_sub_path_2 / 'usgs_streamflow'

for source in source_lst[1:]:
    # Source dependent
    out_path = Path.cwd() / "Output" / source
    # parents=True: "Output" itself may not exist yet, which would otherwise
    # raise FileNotFoundError.
    out_path.mkdir(parents=True, exist_ok=True)
    forcing_path = path / forcing_sub_path_1 / forcing_sub_path_2 / forcing_sub_path_3 / source
    # Sorted so that the processing order is reproducible between runs.
    sub_dirs_forcing = sorted(forcing_path.glob("*"))
    model_ouput_path = path / f'basin_timeseries_v1p2_modelOutput_{source}' / f'model_output_{source}' / 'model_output' / 'flow_timeseries' / source
    # The parameter files label the coefficient differently for daymet than
    # for the other sources.
    line_start = "pet_coef" if source == "daymet" else "PT_COEF"

    for sub_dirs in sub_dirs_forcing:
        basin_mean_forcing_files = sorted(sub_dirs.glob("*.txt"))
        for basin_mean_forcing_file in basin_mean_forcing_files:
            # The first 8 characters of the file name are the basin ID.
            basin_id = basin_mean_forcing_file.name[:8]
            if int(basin_id) in skip_ids:
                continue

            # Basin dependent
            folder_id = basin_mean_forcing_file.parent.name
            catchment_char = characteristics_all.loc[int(basin_id)].copy()
            parameter_files_path = list((model_ouput_path / folder_id).glob(f'{basin_id}_*_model_parameters.txt'))

            # Average the coefficient over all parameter files of the basin:
            # last tab-separated field of each line starting with `line_start`.
            # NOTE(review): if nothing matches, alpha becomes NaN rather than
            # raising.
            lst_alpha = []
            for file in parameter_files_path:
                with open(file, 'r') as lines:
                    for line in lines:
                        if line.startswith(line_start):
                            lst_alpha.append(float(line.strip().split("\t")[-1]))
            alpha = np.array(lst_alpha).mean()

            streamflow_file_path = streamflow_path / folder_id / f'{basin_id}_streamflow_qc.txt'
            ds = from_camels_txt(basin_mean_forcing_file,
                                 basin_id,
                                 source,
                                 start_date,
                                 end_date,
                                 out_path,
                                 alpha,
                                 catchment_char,
                                 streamflow_file_path)
            # Close right away; the returned dataset is not used further here.
            ds.close()

In [46]:
# Files in the daymet output directory whose names end in "nc".
files_daymet = list(Path.cwd().joinpath("Output", "daymet").glob("*nc"))

In [47]:
# Number of daymet output files; the recorded run shows 671, the same as the
# number of rows in characteristics_all.
len(files_daymet)

671

In [48]:
# Files in the maurer output directory whose names end in "nc".
files_maurer = list(Path.cwd().joinpath("Output", "maurer").glob("*nc"))

In [49]:
# Number of maurer output files; the recorded run shows 156, fewer than the
# 671 daymet files.
# NOTE(review): the maurer conversion looks incomplete in the recorded run --
# confirm.
len(files_maurer)

156