# Add simulation data to CAMELS-DE

- create folder `timeseries_simulated`
- one .csv file per catchment
- build a file `CAMELS_DE_simulation_benchmark.csv` with metadata about the model runs for each catchment

In [1]:
import pandas as pd
import os
from glob import glob
from camelsp.util import INPUT_PATH

In [2]:
# Derive the CAMELS-DE ids from the hydromet timeseries file names: the id is the
# text after the last underscore with the ".csv" suffix stripped. Sorting gives
# every later loop a stable processing order.
camels_ids = sorted(
    camels_id.split("_")[-1].split(".csv")[0]
    for camels_id in glob("../output_data/camels_de/timeseries/*.csv")
)

print(f"Total number of stations in CAMELS-DE v1: {len(camels_ids)}")

Total number of stations in CAMELS-DE v1: 1582


## Create folder

In [3]:
# Directory that receives one simulated-discharge CSV per catchment.
# exist_ok=True: do not raise if the directory is already there.
os.makedirs("../output_data/camels_de/timeseries_simulated", exist_ok=True)

## Create .csv files with model results

columns:
* date
* discharge_spec_obs
* discharge_spec_sim_lstm
* discharge_spec_sim_conceptual

In [40]:
# Load the test results of both models. Each object is used below like a dict keyed by
# CAMELS-DE id whose values are DataFrames with "y_obs" and "y_sim" columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted INPUT_PATH.
lstm_data = pd.read_pickle(os.path.join(INPUT_PATH, "LSTM_results/test_results.pickle"))
conceptual_data = pd.read_pickle(os.path.join(INPUT_PATH, "HBV_CAMELS_DE/test_results.pickle"))

# ids without a conceptual simulation; their conceptual column is written as NaN
no_conceptual_data_for_id = []

# largest absolute difference tolerated between the observations stored with each model
obs_tolerance = 0.00001

for id in camels_ids:
    # LSTM results are mandatory for every catchment
    if id not in lstm_data:
        raise ValueError(f"ID {id} not found in LSTM data")
    df_lstm = lstm_data[id].rename(columns={"y_sim": "discharge_spec_sim_lstm", "y_obs": "discharge_spec_obs"})

    # conceptual results are optional
    if id in conceptual_data:
        df_conceptual = conceptual_data[id].rename(columns={"y_sim": "discharge_spec_sim_conceptual", "y_obs": "discharge_spec_obs"})
    else:
        # placeholder on the LSTM index: observations copied from the LSTM frame,
        # simulation column filled with NaN
        df_conceptual = df_lstm[["discharge_spec_obs"]].assign(discharge_spec_sim_conceptual=float("nan"))

        no_conceptual_data_for_id.append(id)

    # The observations stored with both models must agree. Compare the ABSOLUTE difference so
    # that deviations in either direction are caught; a signed comparison would silently accept
    # conceptual observations that are smaller than the LSTM ones. NaN differences compare as
    # False and therefore do not trigger the error.
    obs_diff = (df_conceptual["discharge_spec_obs"] - df_lstm["discharge_spec_obs"]).abs()
    if (obs_diff > obs_tolerance).any():
        raise ValueError(f"Discharge observations are not the same for ID {id}")

    # merge lstm and conceptual data, only keep one discharge_spec_obs column
    df = pd.concat([df_lstm, df_conceptual.drop(columns="discharge_spec_obs")], axis=1)

    # create the filename
    fname = f"../output_data/camels_de/timeseries_simulated/CAMELS_DE_discharge_sim_{id}.csv"

    # round to 2 decimal places
    df = df.round(2)

    # save to csv; the index is written as the "date" column
    df.to_csv(fname, index=True, index_label="date")

In [42]:
# Hargreaves PET pickle shipped with the HBV inputs, loaded for inspection.
# NOTE(review): the bare expression below renders the whole mapping (one frame per
# catchment id), which produces a very large cell output.
pet_path = os.path.join(INPUT_PATH, "HBV_CAMELS_DE/pet_hargreaves.pickle")
pet_data = pd.read_pickle(pet_path)

pet_data

{'DE110000':             pet(mm/day)
 date                   
 1951-01-01     0.480202
 1951-01-02     0.479638
 1951-01-03     0.424903
 1951-01-04     0.408008
 1951-01-05     0.530281
 ...                 ...
 2020-12-27     0.351084
 2020-12-28     0.437371
 2020-12-29     0.458048
 2020-12-30     0.431803
 2020-12-31     0.418595
 
 [25568 rows x 1 columns],
 'DE110010':             pet(mm/day)
 date                   
 1951-01-01     0.479184
 1951-01-02     0.479522
 1951-01-03     0.424885
 1951-01-04     0.407720
 1951-01-05     0.528254
 ...                 ...
 2020-12-27     0.352041
 2020-12-28     0.437810
 2020-12-29     0.458762
 2020-12-30     0.432541
 2020-12-31     0.419627
 
 [25568 rows x 1 columns],
 'DE110020':             pet(mm/day)
 date                   
 1951-01-01     0.472742
 1951-01-02     0.482286
 1951-01-03     0.432944
 1951-01-04     0.412064
 1951-01-05     0.516626
 ...                 ...
 2020-12-27     0.352385
 2020-12-28     0.443234
 2020-

## Create CAMELS_DE_simulation_benchmark.csv

In [47]:
# Summary table: one row per CAMELS-DE id, no columns yet.
# Later cells add the completeness and NSE columns to it.
df_results = pd.DataFrame(index=camels_ids)

### Share of days with `discharge_spec` observations in each period

- training period: `01.10.1970` to `31.12.1999`
- validation period: `01.10.1965` to `30.09.1970`
- test period: `01.10.2001` to `31.12.2020`

In [48]:
# Period boundaries as [start, end] date strings; the label-based slices below
# include both end points.
training = ["1970-10-01", "1999-12-31"]
validation = ["1965-10-01", "1970-09-30"]
testing = ["2001-10-01", "2020-12-31"]

# result column -> period whose completeness it holds
periods = {
    "training_perc_complete": training,
    "validation_perc_complete": validation,
    "testing_perc_complete": testing,
}

for id in camels_ids:
    # hydromet timeseries of this catchment, indexed by date
    df = pd.read_csv(f"../output_data/camels_de/timeseries/CAMELS_DE_hydromet_timeseries_{id}.csv", index_col=0, parse_dates=True)

    for column, (start, end) in periods.items():
        discharge = df.loc[start:end, "discharge_spec"]

        # fraction of days in the period with a non-NaN discharge_spec value
        perc_complete = discharge.count() / len(discharge)

        df_results.loc[id, column] = round(perc_complete, 2)

df_results

Unnamed: 0,training_perc_complete,validation_perc_complete,testing_perc_complete
DE110000,1.00,0.82,1.00
DE110010,0.99,1.00,0.95
DE110020,1.00,1.00,1.00
DE110030,1.00,1.00,1.00
DE110040,1.00,1.00,1.00
...,...,...,...
DEG10580,1.00,1.00,1.00
DEG10590,1.00,1.00,1.00
DEG10600,1.00,1.00,1.00
DEG10610,1.00,1.00,1.00


### Add NSE per catchment
* lstm
* conceptual

In [49]:
# Per-catchment NSE of the LSTM, indexed by basin id and renamed so that the
# column stays unique after the join.
df_nse_lstm = (
    pd.read_csv(os.path.join(INPUT_PATH, "LSTM_results/NSE.csv"))
    .set_index("basin_id")
    .rename(columns={"NSE": "NSE_lstm"})
)

# Same treatment for the conceptual model.
df_nse_conceptual = (
    pd.read_csv(os.path.join(INPUT_PATH, "HBV_CAMELS_DE/HBV_NSE.csv"))
    .set_index("basin_id")
    .rename(columns={"NSE": "NSE_conceptual"})
)

# Attach both NSE tables to the summary table, matching on the index.
df_results = df_results.join(df_nse_lstm).join(df_nse_conceptual)

df_results

Unnamed: 0,training_perc_complete,validation_perc_complete,testing_perc_complete,NSE_lstm,NSE_conceptual
DE110000,1.00,0.82,1.00,0.931,0.819
DE110010,0.99,1.00,0.95,0.819,-0.026
DE110020,1.00,1.00,1.00,0.936,0.801
DE110030,1.00,1.00,1.00,0.913,0.777
DE110040,1.00,1.00,1.00,0.859,0.602
...,...,...,...,...,...
DEG10580,1.00,1.00,1.00,0.918,0.745
DEG10590,1.00,1.00,1.00,0.953,0.865
DEG10600,1.00,1.00,1.00,0.601,0.515
DEG10610,1.00,1.00,1.00,0.779,0.664


### Save csv

In [76]:
# Write the summary table to disk; the index is stored under the column name "gauge_id".
df_results.to_csv("../output_data/camels_de/CAMELS_DE_simulation_benchmark.csv", index_label="gauge_id")