# Add simulation data to CAMELS-DE

- create folder `timeseries_simulated`
- one .csv file per catchment
- build a file `CAMELS_DE_simulation_benchmark.csv` with per-catchment metadata about the model runs (data completeness per period and NSE)

In [1]:
import pandas as pd
import os
from glob import glob
from camelsp.util import INPUT_PATH

In [2]:
# Derive the CAMELS-DE ids from the hydromet timeseries file names: the id is the
# last "_"-separated token of each path, with the ".csv" suffix stripped.
timeseries_files = glob("../output_data/camels_de/timeseries/*.csv")

# keep the ids in sorted order
camels_ids = sorted(path.split("_")[-1].split(".csv")[0] for path in timeseries_files)

print(f"Total number of stations in CAMELS-DE v1: {len(camels_ids)}")

Total number of stations in CAMELS-DE v1: 0


## Create folder

In [11]:
# Folder that receives the simulated timeseries CSVs; exist_ok=True makes re-running this cell a no-op.
simulated_dir = "../output_data/camels_de/timeseries_simulated"
os.makedirs(simulated_dir, exist_ok=True)

## Create .csv files with model result timeseries

In [3]:
# Load the LSTM results; the loop below iterates them as {catchment id: DataFrame}.
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
lstm_data = pd.read_pickle(os.path.join(INPUT_PATH, "LSTM_results/test_results.pickle"))

# set for O(1) membership checks instead of scanning the id list for every catchment
known_ids = set(camels_ids)

# `camels_id` rather than `id`, so the builtin id() is not shadowed in the notebook namespace
for camels_id, sim in lstm_data.items():
    # skip results for ids that are not in camels_ids
    if camels_id not in known_ids:
        continue

    # one output file per catchment
    fname = f"../output_data/camels_de/timeseries_simulated/CAMELS_DE_discharge_sim_{camels_id}.csv"

    (
        sim
        # rename the model output columns
        .rename(columns={"y_sim": "discharge_spec_sim", "y_obs": "discharge_spec_obs"})
        # round to 2 decimal places
        .round(2)
        # the index is written out as the "date" column
        .to_csv(fname, index=True, index_label="date")
    )

## Create CAMELS_DE_simulation_benchmark.csv

In [53]:
# Empty per-catchment results table: one row per id in camels_ids, no columns yet.
# The completeness ratios and the NSE are added to it in later cells.
df_results = pd.DataFrame(index=camels_ids)

### Share of days with `discharge_spec` data in each period

- training period: `01.10.1970` to `31.12.1999`
- validation period: `01.10.1965` to `30.09.1970`
- test period: `01.10.2001` to `31.12.2020`

In [55]:
# Inclusive [start, end] dates of the model periods (ISO format, used for label slicing).
training = ["1970-10-01", "1999-12-31"]
validation = ["1965-10-01", "1970-09-30"]
testing = ["2001-10-01", "2020-12-31"]

# Period name -> date range; the name is also the prefix of the result column.
periods = {"training": training, "validation": validation, "testing": testing}

# `camels_id` rather than `id`, so the builtin id() is not shadowed in the notebook namespace
for camels_id in camels_ids:
    # read the hydromet timeseries; the first column is parsed as the datetime index
    hydromet = pd.read_csv(
        f"../output_data/camels_de/timeseries/CAMELS_DE_hydromet_timeseries_{camels_id}.csv",
        index_col=0,
        parse_dates=True,
    )

    for period_name, (start, end) in periods.items():
        # rows of this catchment that fall inside the period (both ends inclusive)
        discharge = hydromet.loc[start:end, "discharge_spec"]

        # Ratio of non-NaN discharge_spec values to rows present in the period.
        # NOTE: the denominator is the number of rows in the file for that period,
        # not the number of calendar days - this assumes the file has no missing dates.
        n_rows = len(discharge)
        ratio = discharge.count() / n_rows if n_rows else float("nan")

        # store the ratio (0-1), rounded to 2 decimals
        df_results.loc[camels_id, f"{period_name}_perc_complete"] = round(ratio, 2)

df_results

Unnamed: 0,training_perc_complete,validation_perc_complete,testing_perc_complete
DE110000,1.00,1.0,1.0
DE110010,0.99,1.0,1.0
DE110020,1.00,1.0,1.0
DE110030,1.00,1.0,1.0
DE110040,1.00,1.0,1.0
...,...,...,...
DEG10580,1.00,1.0,1.0
DEG10590,1.00,1.0,1.0
DEG10600,1.00,1.0,1.0
DEG10610,1.00,1.0,1.0


### Add NSE per catchment

In [60]:
# NSE values of the LSTM runs, indexed by basin_id so they align with the index of df_results
df_nse = pd.read_csv(os.path.join(INPUT_PATH, "LSTM_results/NSE.csv")).set_index("basin_id")

# Left join on the index: every row of df_results is kept, ids without an NSE entry get NaN.
# Columns left over from a previous run of this cell are dropped first; without that,
# re-running the cell raises "columns overlap but no suffix specified".
df_results = df_results.drop(columns=df_nse.columns, errors="ignore").join(df_nse)

df_results

Unnamed: 0,training_perc_complete,validation_perc_complete,testing_perc_complete,NSE
DE110000,1.00,1.0,1.0,0.932
DE110010,0.99,1.0,1.0,0.714
DE110020,1.00,1.0,1.0,0.941
DE110030,1.00,1.0,1.0,0.931
DE110040,1.00,1.0,1.0,0.853
...,...,...,...,...
DEG10580,1.00,1.0,1.0,0.913
DEG10590,1.00,1.0,1.0,0.952
DEG10600,1.00,1.0,1.0,0.494
DEG10610,1.00,1.0,1.0,0.785


### Save csv

In [None]:
# Write the per-catchment results table; the index is written as the "gauge_id" column.
benchmark_path = "../output_data/camels_de/CAMELS_DE_simulation_benchmark.csv"
df_results.to_csv(benchmark_path, index_label="gauge_id")