# Merge Metadata

This script can be used to build the final Metadata file.

There are several notes that are important:

* Some of the location files have been processed externally and the workflow needs to be explained here
* We should note done all CRS transformations applied for reference

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj.transformer import Transformer

from camelsp import Bundesland, util

  def hasna(x: np.ndarray) -> bool:


As an example: The `Bundesland` context manager can load the metadata for the given Bundesland only from the full metadata table. If this table does not yet exist, it is created from the NUTSID mapping table. Check out for Saarland:

In [2]:
with Bundesland('DEB') as bl:
    dec_meta = bl.metadata

dec_meta

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state
1026,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz
1027,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz
1028,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz
1029,DEB10030,2546058800,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz
1030,DEB10040,2546070400,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz
...,...,...,...,...,...
1145,DEB11190,2679020500,./DEB/DEB11190/DEB11190_data.csv,DEB,Rheinland-Pfalz
1146,DEB11200,2680020700,./DEB/DEB11200/DEB11200_data.csv,DEB,Rheinland-Pfalz
1147,DEB11210,2682050000,./DEB/DEB11210/DEB11210_data.csv,DEB,Rheinland-Pfalz
1148,DEB11220,2683060500,./DEB/DEB11220/DEB11220_data.csv,DEB,Rheinland-Pfalz


## Generate basic metadata

This step will produce one metadata file containing all processed data, which can be used as NUTS lookup and as a basis to add more specific metadata.
The first step also loads the Location files and merges everything

### Add Pegelname, Gewässername and elevation

add the above fields to the merged metadata, if we have the information from the Landesämter.

In [3]:
# lookup dictionary for column names in raw_metadata
_META_DICT = {
    'DE1': {'provider_id': 'Messstellennummer', 'gauge_name': 'Standort', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Pegelnullpunkt (PNP) in m'},
    'DE2': {'provider_id': 'Stationsnummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer (Name|Nummer)', 'gauge_elevation': 'PNP'},
    'DE4': {'provider_id': 'Messstellennummer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': 'PNP_Höhe'},
    'DE7': {'provider_id': 'Messstellen Nr.', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Höhe              [m ü. NN]'},
    'DE8': {'provider_id': 'pegelkennzahl', 'gauge_name': 'bezeichnung', 'waterbody_name': 'gewaesser', 'gauge_elevation': 'pnp'},
    'DE9': {'provider_id': 'MESSSTELLE_NR', 'gauge_name': 'LANGNAME', 'waterbody_name': np.nan, 'gauge_elevation': np.nan},
    'DEA': {'provider_id': 'Stationsnummer', 'gauge_name': 'Station', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'NULLPUNKT'},
    'DEB': {'provider_id': 'Nummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DEC': {'provider_id': 'MSTNR', 'gauge_name': 'Pegelname_', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DED': {'provider_id': 'Pegelkennziffer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': np.nan},
    'DEE': {'provider_id': 'SANR', 'gauge_name': 'SNAME', 'waterbody_name': 'SWATER', 'gauge_elevation': np.nan},
    'DEF': {'provider_id': 'id', 'gauge_name': 'gauge', 'waterbody_name': 'river', 'gauge_elevation': np.nan},
    'DEG': {'provider_id': 'Pegelnr', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
}

for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            # read raw metadata for bl
            p = os.path.join(bl.base_path, 'raw_metadata', f"{bl.NUTS}_raw_metadata.csv")
            df = pd.read_csv(p)

            # get relevant metadata columns from lookup dict
            cols = list(_META_DICT[NUTS].values())
            # drop nan from cols
            cols = [col for col in cols if not pd.isna(col)]

            # select relevant columns
            df = df[cols]
            
            # rename columns
            df = df.rename(columns=dict((v,k) for k,v in _META_DICT[NUTS].items()))

            # make provider_id a string
            df['provider_id'] = df['provider_id'].astype(str)

            # transform gauge_elevation to float
            if 'gauge_elevation' in df.columns:
                # make sure that gauge_elevation is a string
                df['gauge_elevation'] = df['gauge_elevation'].astype(str)
                
                # replace comma with dot
                df['gauge_elevation'] = df['gauge_elevation'].str.replace(',', '.')

                # remove all non numeric characters
                df['gauge_elevation'] = df['gauge_elevation'].str.extract('([\d.]+)').astype(float)

                # transform to float
                df['gauge_elevation'] = df['gauge_elevation'].astype(float)
        except FileNotFoundError:
            continue
        
        # update metadata
        bl.update_metadata(df, id_column='provider_id')

util.get_metadata()

  0%|          | 0/16 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:00<00:00, 69.22it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation
0,573000,DEG10000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243
1,447000,DEG10010,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288
2,574200,DEG10020,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577
3,576500,DEG10030,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995
4,570210,DEG10040,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517
...,...,...,...,...,...,...,...,...
2993,76273,DE112470,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871
2994,76274,DE112480,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832
2995,76276,DE112490,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000
2996,76290,DE112500,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098


In [4]:
for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            p = os.path.join(bl.base_path, 'locations', f'{bl.NUTS}_Locations.csv')
            # read in 
            df = pd.read_csv(p, dtype={'ID': str})
            df.columns = ['provider_id', 'area', 'x', 'y']
            #df.drop_duplicates(inplace=True)
        except FileNotFoundError:
            continue
        
        # update by simply setting the new metadata to the property setter
        # in this case, the joining column needs to be 'camels_id' or 'provider_id'
        #bl.metadata = df

        # or use the function if you prefer
        bl.update_metadata(df, id_column='provider_id')

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

100%|██████████| 16/16 [00:00<00:00, 66.41it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y
1026,2546015800,DEB10000,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,Nanzdietschweiler,Glan,215.499,200.94,4.135583e+06,2.929086e+06
1027,2546030700,DEB10010,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,Eschenau,Glan,180.334,598.31,4.139014e+06,2.946132e+06
1028,2546040900,DEB10020,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,Odenbach,Glan,147.750,1088.17,4.151599e+06,2.955622e+06
1029,2546058800,DEB10030,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,Niedermohr,Mohrbach,214.127,100.76,4.137186e+06,2.930547e+06
1030,2546070400,DEB10040,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,Untersulzbach,Lauter,202.390,215.31,4.151824e+06,2.937774e+06
...,...,...,...,...,...,...,...,...,...,...,...
1145,2679020500,DEB11190,./DEB/DEB11190/DEB11190_data.csv,DEB,Rheinland-Pfalz,Traben-Trarbach,Kautenbach,146.310,51.12,4.113870e+06,2.983639e+06
1146,2680020700,DEB11200,./DEB/DEB11200/DEB11200_data.csv,DEB,Rheinland-Pfalz,Saxler Mühle,Alf,394.500,39.85,4.098666e+06,3.007109e+06
1147,2682050000,DEB11210,./DEB/DEB11210/DEB11210_data.csv,DEB,Rheinland-Pfalz,Hasborner Mühle,Sammetbach,283.602,22.70,4.100465e+06,2.999367e+06
1148,2683060500,DEB11220,./DEB/DEB11220/DEB11220_data.csv,DEB,Rheinland-Pfalz,Bengel,Alf,144.047,138.15,4.110353e+06,2.993363e+06


## Add WGS84 coordinates

In [5]:
# create a transformer
transformer = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# transform
lon, lat = transformer.transform(metadata.x.values, metadata.y.values)

# add back
updates = pd.DataFrame({'camels_id': metadata.camels_id, 'lon': lon, 'lat': lat})
util.update_metadata(new_metadata=updates)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat
1026,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,Nanzdietschweiler,Glan,215.499,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642
1027,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,Eschenau,Glan,180.334,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891
1028,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,Odenbach,Glan,147.750,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918
1029,DEB10030,2546058800,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,Niedermohr,Mohrbach,214.127,100.76,4.137186e+06,2.930547e+06,7.464438,49.459265
1030,DEB10040,2546070400,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,Untersulzbach,Lauter,202.390,215.31,4.151824e+06,2.937774e+06,7.663110,49.528576
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,DEB11190,2679020500,./DEB/DEB11190/DEB11190_data.csv,DEB,Rheinland-Pfalz,Traben-Trarbach,Kautenbach,146.310,51.12,4.113870e+06,2.983639e+06,7.114764,49.928586
1146,DEB11200,2680020700,./DEB/DEB11200/DEB11200_data.csv,DEB,Rheinland-Pfalz,Saxler Mühle,Alf,394.500,39.85,4.098666e+06,3.007109e+06,6.889509,50.133828
1147,DEB11210,2682050000,./DEB/DEB11210/DEB11210_data.csv,DEB,Rheinland-Pfalz,Hasborner Mühle,Sammetbach,283.602,22.70,4.100465e+06,2.999367e+06,6.919144,50.064973
1148,DEB11220,2683060500,./DEB/DEB11220/DEB11220_data.csv,DEB,Rheinland-Pfalz,Bengel,Alf,144.047,138.15,4.110353e+06,2.993363e+06,7.060457,50.014688


## Count existing data

Go for each file and count the available data. Add anything that makes it necessary to read every single data file into the for-loop below.

Checking columns:

* `'q'` 
* `'w'`

In [6]:
for NUTS in util._NUTS_LVL2_NAMES.keys():
    # empty container for this BL
    count_q = []
    count_w = []
    
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # go for each id
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            # load the data
            try:
                df = bl.get_data(camels_id)
            except FileNotFoundError:
                count_q.append(0)
                count_w.append(0)
                continue
    
            # check q
            if 'q' in df.columns.values:
                count_q.append((~df.q.isna()).count())
            else:
                count_q.append(0)

            # check w
            if 'w' in df.columns.values:
                count_w.append((~df.w.isna()).count())
            else:
                count_w.append(0)

        # build the new metadata
        counts = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_count': np.asarray(count_q, dtype=int), 'w_count': np.asarray(count_w, dtype=int)})

        # add to metadata
        bl.update_metadata(counts)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:05<00:00, 45.57it/s]
DE2: 100%|██████████| 535/535 [00:14<00:00, 36.61it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:05<00:00, 43.75it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:02<00:00, 36.88it/s]
DE8: 100%|██████████| 230/230 [00:03<00:00, 75.79it/s]
DE9: 100%|██████████| 282/282 [00:03<00:00, 78.14it/s]
DEA: 100%|██████████| 219/219 [00:04<00:00, 46.47it/s]
DEB: 100%|██████████| 124/124 [00:02<00:00, 48.17it/s]
DEC: 100%|██████████| 46/46 [00:00<00:00, 53.83it/s]
DED: 100%|██████████| 282/282 [00:04<00:00, 65.00it/s]
DEE: 100%|██████████| 126/126 [00:02<00:00, 47.05it/s]
DEF: 100%|██████████| 509/509 [00:05<00:00, 88.92it/s] 
DEG: 100%|██████████| 63/63 [00:02<00:00, 27.84it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count
1026,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,Nanzdietschweiler,Glan,215.499,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642,15388.0,15388.0
1027,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,Eschenau,Glan,180.334,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891,20907.0,20907.0
1028,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,Odenbach,Glan,147.750,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918,25048.0,25048.0
1029,DEB10030,2546058800,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,Niedermohr,Mohrbach,214.127,100.76,4.137186e+06,2.930547e+06,7.464438,49.459265,10013.0,10013.0
1030,DEB10040,2546070400,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,Untersulzbach,Lauter,202.390,215.31,4.151824e+06,2.937774e+06,7.663110,49.528576,26294.0,26294.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,DEB11190,2679020500,./DEB/DEB11190/DEB11190_data.csv,DEB,Rheinland-Pfalz,Traben-Trarbach,Kautenbach,146.310,51.12,4.113870e+06,2.983639e+06,7.114764,49.928586,10283.0,10283.0
1146,DEB11200,2680020700,./DEB/DEB11200/DEB11200_data.csv,DEB,Rheinland-Pfalz,Saxler Mühle,Alf,394.500,39.85,4.098666e+06,3.007109e+06,6.889509,50.133828,17955.0,17955.0
1147,DEB11210,2682050000,./DEB/DEB11210/DEB11210_data.csv,DEB,Rheinland-Pfalz,Hasborner Mühle,Sammetbach,283.602,22.70,4.100465e+06,2.999367e+06,6.919144,50.064973,17955.0,17955.0
1148,DEB11220,2683060500,./DEB/DEB11220/DEB11220_data.csv,DEB,Rheinland-Pfalz,Bengel,Alf,144.047,138.15,4.110353e+06,2.993363e+06,7.060457,50.014688,3206.0,3206.0


In [9]:
from glob import glob

# empty_data = []
# empty_files = []

for NUTS in util._NUTS_LVL2_NAMES.keys():

    bl = Bundesland(NUTS)

    print(NUTS)

#     # empty data
#     for id in metadata['camels_id'].values:
#         try:
#             df = bl.get_data(id)
#             if len(df) == 0:
#                 empty_data.append(id)
#         except FileNotFoundError:
#             empty_files.append(id)

    print(f"{len(bl.metadata[(bl.metadata['q_count'] == 0) & (bl.metadata['w_count'] == 0)])}")

    # empty folders
    all_folders = glob(f"{bl.base_path}/{NUTS}/*")

    print("Empty folders:")
    print(f"{len([folder for folder in all_folders if len(os.listdir(folder)) == 0])}\n")

DE1
0
Empty folders:
0

DE2
0
Empty folders:
0

DE3
0
Empty folders:
0

DE4
0
Empty folders:
0

DE5
0
Empty folders:
0

DE6
0
Empty folders:
0

DE7
0
Empty folders:
0

DE8
0
Empty folders:
0

DE9
0
Empty folders:
0

DEA
0
Empty folders:
0

DEB
0
Empty folders:
0

DEC
0
Empty folders:
0

DED
0
Empty folders:
0

DEE
0
Empty folders:
0

DEF
0
Empty folders:
0

DEG
0
Empty folders:
0

