# CAMELS-DE Dataset Metrics

This notebook should be run after the dataset has been processed or after changes have been made to the processing.
It generates a number of metrics for the dataset and stores them as plotly JSON files for the Metrics API.

In [1]:
from camelsp import Bundesland, Station, util
import os
import plotly.graph_objects as go 
import plotly.express as px
from tqdm import tqdm
import json
import numpy as np
import pandas as pd
from collections import defaultdict
import warnings

## Main loop

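Iterate over all station datasets of every federal state, record which data files could be loaded, and count the quality flags for discharge and water level.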

In [2]:
# containers for the sunburst flag tree; index 0 is the root node,
# its value is filled in after the loop
ids = ["CAMELS-DE"]
labels = ["CAMELS-DE"]
parents = [""]
values = [0]
all_ = 0

# One entry per station in BOTH lists, so they can be zipped later:
file_exists = []    # True if the data file of the station could be loaded
file_nuts = []      # NUTS code of the state the station belongs to

# load all metadata 
metadata = util.get_metadata()


def _count_flags(counter, flags):
    """Add the NaN / checked / not checked counts of one flag column to ``counter``."""
    known = flags.dropna()
    counter['NaN'] += flags.isna().sum()
    counter['checked'] += known.sum()
    counter['not checked'] += (~(known.astype(bool))).sum()


for NUTS in util._NUTS_LVL2_NAMES.keys():
    with Bundesland(NUTS) as bl:
        meta = bl.metadata

        # per-state flag counters
        w = defaultdict(int)
        q = defaultdict(int)

        # load all datasets
        for camels_id in tqdm(meta.camels_id.values):
            # Label every station, including those whose file cannot be loaded.
            # Appending the label only on success would leave file_nuts shorter
            # than file_exists and misalign the two lists.
            file_nuts.append(NUTS)
            try:
                df = bl.get_data(camels_id)
            except Exception:
                # best effort: a station without a loadable file is only counted as missing
                file_exists.append(False)
                continue
            file_exists.append(True)

            # calculate for w
            if 'w_flag' in df:
                _count_flags(w, df.w_flag)

            # calculate for q
            if 'q_flag' in df:
                _count_flags(q, df.q_flag)

        # add to container
        name = util._NUTS_LVL2_NAMES[NUTS]

        # first level (state) + second level (variable)
        tot_q = sum(q.values())
        tot_w = sum(w.values())
        labels.extend([name, 'discharge', 'waterlevel'])
        ids.extend([name, f'{name}-q', f'{name}-w'])
        parents.extend(['CAMELS-DE', name, name])
        values.extend([tot_q + tot_w, tot_q, tot_w])

        # third level: one node per flag value
        for stat, short in zip([q, w], ['q', 'w']):
            for key, count in stat.items():
                labels.append(key)
                ids.append(f'{name}-{short}-{key}')
                parents.append(f'{name}-{short}')
                values.append(count)

        all_ += tot_q + tot_w

# the root node holds the overall number of flags
values[0] = all_


  0%|          | 0/252 [00:00<?, ?it/s]

100%|██████████| 252/252 [00:08<00:00, 31.46it/s]
100%|██████████| 535/535 [00:19<00:00, 27.34it/s]
0it [00:00, ?it/s]
100%|██████████| 233/233 [00:08<00:00, 27.92it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 97/97 [00:03<00:00, 25.63it/s]
100%|██████████| 229/229 [00:04<00:00, 56.45it/s]
100%|██████████| 261/261 [00:04<00:00, 55.20it/s]
100%|██████████| 219/219 [00:06<00:00, 35.78it/s]
100%|██████████| 124/124 [00:03<00:00, 36.36it/s]
100%|██████████| 46/46 [00:01<00:00, 32.07it/s]
100%|██████████| 178/178 [00:03<00:00, 54.66it/s]
100%|██████████| 126/126 [00:04<00:00, 29.04it/s]
100%|██████████| 507/507 [00:10<00:00, 47.71it/s]
100%|██████████| 63/63 [00:02<00:00, 21.46it/s]


## Sunburst

In [3]:
# Sunburst over the flag tree built in the main loop. branchvalues="total"
# because every parent value was stored as the sum of its children.
sunburst = go.Sunburst(
    ids=ids,
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
    hoverinfo="label+percent parent+percent root+value",
)

fig = go.Figure(data=[sunburst])
fig.update_layout(template='plotly_dark')
fig

In [4]:
# create the metrics directory if it does not exist
os.makedirs("../output_data/metrics", exist_ok=True)

# Write utf-8 explicitly instead of the platform's locale encoding.
# NOTE(review): the figure labels come from util._NUTS_LVL2_NAMES and presumably
# contain umlauts (e.g. state names) - confirm.
with open('../output_data/metrics/flags.plotly.json', 'w', encoding='utf-8') as f:
    f.write(fig.to_json())

# add the description shown next to the figure
flags_description = {
    'title': 'CAMELS-DE quality flags',
    'body': 'The sunburst shows the number of quality flags available on three levels. The first level discriminates by federal state. By clicking on a state, you can filter for this state only. The second level breaks the number of flags measured in that state further down by variable and the third level by the flag. Currently, flags can be True (checked), False (not checked) or NaN if there is no flag information or the measurements are missing',
    'actions': [{'href': 'https://github.com/CAMELS-DE/camelsp/blob/main/scripts/dataset_metrics.ipynb', 'title': 'Resource on Github'}]
}
with open('../output_data/metrics/flags.description.json', 'w', encoding='utf-8') as f:
    json.dump(flags_description, f)


## Number of Stations

In [5]:
# Number of stations whose data file could be loaded; the delta is taken
# against the number of rows in the full metadata table.
indicator = go.Indicator(
    mode="number+delta",
    value=sum(file_exists),
    delta={'reference': len(metadata)},
    title="Number of Data files",
)

fig = go.Figure(data=[indicator])
fig.update_layout(template='plotly_dark')
fig

In [6]:
# create the metrics directory if it does not exist
os.makedirs("../output_data/metrics", exist_ok=True)

# write utf-8 explicitly instead of the platform's locale encoding
with open('../output_data/metrics/count.plotly.json', 'w', encoding='utf-8') as f:
    f.write(fig.to_json())

# add the description shown next to the figure
count_description = {
    'title': 'CAMELS-DE data files',
    'body': 'This is the current amount of CAMELS-DE data files, that have been processed. The delta indicator shows the difference to the size of the Metadata table. A negative number indicates, that there are discharge stations for which either no data was provided at all, or the processing failed altogether.',
    'actions': [{'href': 'https://github.com/CAMELS-DE/camelsp/blob/main/scripts/dataset_metrics.ipynb', 'title': 'Resource on Github'}]
}
with open('../output_data/metrics/count.description.json', 'w', encoding='utf-8') as f:
    json.dump(count_description, f)

## Existing data files

In [7]:
fig = go.Figure()

# One stacked horizontal bar per federal state: files that could be loaded
# (green) followed by files that could not (red).
for nuts_code, state_name in util._NUTS_LVL2_NAMES.items():
    # pair each exists-flag with its NUTS label and keep those of this state
    state_flags = [found for found, code in zip(file_exists, file_nuts) if code == nuts_code]
    n_existing = sum(state_flags)
    n_missing = len(state_flags) - n_existing

    segments = [
        (n_existing, 'green', 'existing files'),
        (n_missing, 'red', "missing files"),
    ]
    for count, color, trace_name in segments:
        fig.add_trace(go.Bar(
            x=[count],
            y=[state_name],
            orientation='h',
            marker=dict(color=color),
            name=trace_name,
            showlegend=False,
        ))

fig.update_layout(barmode="stack", template='plotly_dark', margin=dict(t=5, b=15))
fig

In [8]:
# create the metrics directory if it does not exist
os.makedirs("../output_data/metrics", exist_ok=True)

# Write utf-8 explicitly instead of the platform's locale encoding.
# NOTE(review): the bar labels come from util._NUTS_LVL2_NAMES and presumably
# contain umlauts (e.g. state names) - confirm.
with open('../output_data/metrics/states_count.plotly.json', 'w', encoding='utf-8') as f:
    f.write(fig.to_json())

# add the description shown next to the figure
states_count_description = {
    'title': 'Data files per state',
    'body': 'The bars show the amount of CAMELS-DE data files, that have been processed. The green bars show how many data files have been processed, the red bar indicates the number of missing files. Files are missing for discharge stations, if either no data was provided at all, or the processing failed altogether.',
    'actions': [{'href': 'https://github.com/CAMELS-DE/camelsp/blob/main/scripts/dataset_metrics.ipynb', 'title': 'Resource on Github'}]
}
with open('../output_data/metrics/states_count.description.json', 'w', encoding='utf-8') as f:
    json.dump(states_count_description, f)

## Density plot

In [9]:
nuts = list(util._NUTS_LVL2_NAMES.keys())

# Collect the q and w series of every station in every Bundesland
q_list = []
w_list = []
for ID in nuts:
    with Bundesland(ID) as bl:
        # Loading errors are turned into warnings and recorded per state,
        # so they are summarised in one line instead of flooding the output.
        with warnings.catch_warnings(record=True) as warn:
            # Record every UserWarning (the category warnings.warn() uses by
            # default); otherwise repeated identical messages are only
            # recorded once and the count below would be too low.
            warnings.simplefilter("always", UserWarning)

            # camels_id instead of `id`, which would shadow the builtin
            for camels_id in tqdm(bl.metadata.camels_id.values, desc=ID):
                try:
                    data = bl.get_data(camels_id)

                    # skip empty datasets and datasets with duplicated index
                    # entries (they cannot be aligned unambiguously later on)
                    if data.empty or data.index.duplicated().any():
                        continue

                    if 'q' in data.columns:
                        q_list.append(data["q"])
                    if 'w' in data.columns:
                        w_list.append(data["w"])

                except Exception as e:
                    # best effort: keep going, but remember the failure
                    warnings.warn(str(e))

            if len(warn) > 0:
                print(f"There were {len(warn)} warnings (missing data files?).")
                

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:07<00:00, 32.71it/s]
DE2: 100%|██████████| 535/535 [00:19<00:00, 26.91it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:06<00:00, 38.08it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:02<00:00, 33.45it/s]
DE8: 100%|██████████| 229/229 [00:03<00:00, 63.98it/s]
DE9: 100%|██████████| 261/261 [00:05<00:00, 48.96it/s]
DEA: 100%|██████████| 219/219 [00:05<00:00, 40.72it/s]
DEB: 100%|██████████| 124/124 [00:03<00:00, 38.50it/s]
DEC: 100%|██████████| 46/46 [00:01<00:00, 43.12it/s]
DED: 100%|██████████| 178/178 [00:03<00:00, 50.53it/s]
DEE: 100%|██████████| 126/126 [00:03<00:00, 37.47it/s]
DEF: 100%|██████████| 507/507 [00:07<00:00, 65.63it/s]
DEG: 100%|██████████| 63/63 [00:02<00:00, 26.69it/s]


In [10]:
def merge_series_to_df(series_list):
    """
    Merge a list of pandas Series into one DataFrame, aligned on their index.

    The columns are named ``s1``, ``s2``, ... in the order of ``series_list``.
    Rows cover the union of all indices; positions a series has no value for
    are NaN.

    An empty ``series_list`` returns an empty DataFrame (``pd.concat`` itself
    would raise a ``ValueError`` for it).
    """
    if len(series_list) == 0:
        return pd.DataFrame()

    column_names = [f's{position}' for position in range(1, len(series_list) + 1)]
    return pd.concat(series_list, keys=column_names, axis=1)

# one column per gauge (s1, s2, ...), rows aligned on the series' index
q_df = merge_series_to_df(q_list)
w_df = merge_series_to_df(w_list)

In [11]:
# number of gauges with a non-NaN value per row of the merged frames
q_sum = q_df.count(axis=1).to_frame(name='Q gauges')
w_sum = w_df.count(axis=1).to_frame(name='W gauges')

# the outer join keeps rows that only one of the two variables has
merge = pd.merge(q_sum, w_sum, left_index=True, right_index=True, how="outer")
fig = px.line(merge)
fig.update_layout(legend=dict(orientation='h'), template='plotly_dark')

fig.show()

# create the metrics directory if it does not exist
os.makedirs("../output_data/metrics", exist_ok=True)

with open('../output_data/metrics/gauge_density.plotly.json', 'w', encoding='utf-8') as f:
    f.write(fig.to_json())

# The description goes into its own *.description.json file; writing it to
# the *.plotly.json path would overwrite the figure saved above.
gauge_density_description = {
    'title': 'CAMELS-de gauge density over time',
    'body': 'The graph shows the amount of active gauges from the CAMELS-de processing dataset over time on daily resolution for water level and discharge each. Note that this graph is generated from the processing dataset that will change over time.',
    # NOTE(review): the other metrics link to dataset_metrics.ipynb - confirm this target is still correct
    'actions': [{'href': 'https://github.com/CAMELS-DE/camelsp/blob/main/scripts/density.ipynb', 'title': 'Resource on Github'}]
}
with open('../output_data/metrics/gauge_density.description.json', 'w', encoding='utf-8') as f:
    json.dump(gauge_density_description, f)


## Map of all stations in Germany, colored according to temporal range

Stations with at least 20 years of Q data are colored green; stations with less Q data (or none at all) are colored grey.

In [12]:
# get metadata
meta = util.get_metadata()

# get camels_ids
camels_ids = meta['camels_id'].values


def _extent_years(series):
    """
    Return the time span in years (days / 365) between the first and the last
    non-NaN value of ``series``, or NaN if the series has no valid value.

    Assumes a datetime-like index (the difference of two index labels must
    offer ``.days``) - TODO confirm against Station.get_data().
    """
    valid_index = series.dropna().index
    if len(valid_index) == 0:
        return np.nan
    return (valid_index.max() - valid_index.min()).days / 365


# camels_id instead of `id`, which would shadow the builtin for the rest of the notebook
for camels_id in camels_ids:
    # init Station
    s = Station(camels_id)

    # get the data
    df = s.get_data()

    # the row of this station in the metadata table
    is_station = meta.camels_id == camels_id

    # extent of discharge (q) and water level (w); NaN if the variable is missing
    for variable in ('q', 'w'):
        if variable in df.columns:
            meta.loc[is_station, f'{variable}_extent'] = _extent_years(df[variable])
        else:
            meta.loc[is_station, f'{variable}_extent'] = np.nan

meta.head()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years,q_extent,w_extent
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243,182.7,4352221.0,3124617.0,10.446993,51.231727,29646.0,29646.0,81.219178,32.186301,81.219178,32.186301
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288,275.0,4318941.0,3140875.0,9.970428,51.378709,22707.0,22707.0,62.208219,59.876712,62.208219,59.876712
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577,174.7,4386764.0,3077926.0,10.933022,50.809106,35490.0,35490.0,97.230137,32.186301,97.230137,32.186301
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995,1383.0,4473276.0,3073272.0,12.157989,50.750857,12845.0,12845.0,31.186301,35.189041,31.186301,35.189041
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517,1013.0,4442190.0,3033884.0,11.704738,50.404273,21246.0,21246.0,58.205479,52.032877,58.205479,52.032877


In [13]:
def _station_layer(stations, color, size, legend_name):
    """Build one marker layer of the map from the lon/lat/gauge_name columns of ``stations``."""
    return go.Scattermapbox(
        lon=stations['lon'],
        lat=stations['lat'],
        text=stations['gauge_name'],
        mode='markers',
        marker=go.scattermapbox.Marker(size=size, color=color, opacity=0.8),
        name=legend_name,
        showlegend=True,
        hovertemplate='%{text}<extra></extra>',  # only show the station text on hover
    )


# Stations with q_extent >= 20 years are green. Everything else is grey; that
# includes a NaN extent, because the comparison is False for NaN.
has_long_record = meta['q_extent'] >= 20
meta_green = meta[has_long_record]
meta_grey = meta[~has_long_record]

fig = go.Figure()

# grey layer first, green layer second (same order as the legend entries)
# NOTE(review): the green legend entry says '>' while the filter is '>='
fig.add_trace(_station_layer(meta_grey, 'grey', 9, f'< 20 years of Q,<br>n={len(meta_grey)}'))
fig.add_trace(_station_layer(meta_green, 'green', 8, f'> 20 years of Q,<br>n={len(meta_green)}'))

# map view and legend styling in a single layout update
fig.update_layout(
    mapbox_style="carto-positron",
    mapbox_zoom=5,
    mapbox_center={"lat": 51.1657, "lon": 10.4515},  # Center on Germany
    title_text='Stations',
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    width=550,
    height=600,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        title=None,
        font=dict(family="Courier", size=12, color="black"),
        bgcolor="White",
        bordercolor="Black",
        borderwidth=2,
    ),
)

fig.show()

In [14]:
# create the metrics directory if it does not exist
os.makedirs("../output_data/metrics", exist_ok=True)

# Write utf-8 explicitly: the figure carries the gauge names as hover text and
# these contain non-ASCII characters (e.g. "Weiße Elster"). With the platform's
# locale encoding the file may not be valid UTF-8 JSON.
with open('../output_data/metrics/q_extent_map.plotly.json', 'w', encoding='utf-8') as f:
    f.write(fig.to_json())

# add the description shown next to the figure
q_extent_map_description = {
    'title': 'Map of Stations, colored by extent of Q data',
    'body': 'The map shows the location of all CAMELS-DE stations. The color of the marker indicates the extent of the discharge data in years. The legend shows the number of stations for each color.',
    'actions': [{'href': 'https://github.com/CAMELS-DE/camelsp/blob/main/scripts/dataset_metrics.ipynb', 'title': 'Resource on Github'}]
}
with open('../output_data/metrics/q_extent_map.description.json', 'w', encoding='utf-8') as f:
    json.dump(q_extent_map_description, f)