# Merge Metadata

This script can be used to build the final Metadata file.

There are several notes that are important:

* Some of the location files have been processed externally and the workflow needs to be explained here
* We should note done all CRS transformations applied for reference

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj.transformer import Transformer

from camelsp import Bundesland, Station, util

As an example: The `Bundesland` context manager can load the metadata for the given Bundesland only from the full metadata table. If this table does not yet exist, it is created from the NUTSID mapping table. Check out for Saarland:

In [2]:
with Bundesland('DEC') as bl:
    dec_meta = bl.metadata

dec_meta

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state
980,DEC10000,1271120,./DEC/DEC10000/DEC10000_data.csv,DEC,Saarland
981,DEC10010,1122120,./DEC/DEC10010/DEC10010_data.csv,DEC,Saarland
982,DEC10020,1251120,./DEC/DEC10020/DEC10020_data.csv,DEC,Saarland
983,DEC10030,1102220,./DEC/DEC10030/DEC10030_data.csv,DEC,Saarland
984,DEC10040,1051110,./DEC/DEC10040/DEC10040_data.csv,DEC,Saarland
985,DEC10050,1062220,./DEC/DEC10050/DEC10050_data.csv,DEC,Saarland
986,DEC10060,1113120,./DEC/DEC10060/DEC10060_data.csv,DEC,Saarland
987,DEC10070,1641120,./DEC/DEC10070/DEC10070_data.csv,DEC,Saarland
988,DEC10080,1401120,./DEC/DEC10080/DEC10080_data.csv,DEC,Saarland
989,DEC10090,1381120,./DEC/DEC10090/DEC10090_data.csv,DEC,Saarland


## Generate basic metadata

This step will produce one metadata file containing all processed data, which can be used as NUTS lookup and as a basis to add more specific metadata.
The first step also loads the Location files and merges everything

### Add Pegelname, Gewässername and elevation

add the above fields to the merged metadata, if we have the information from the Landesämter.

In [3]:
# lookup dictionary for column names in raw_metadata
_META_DICT = {
    'DE1': {'provider_id': 'Messstellennummer', 'gauge_name': 'Standort', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Pegelnullpunkt (PNP) in m'},
    'DE2': {'provider_id': 'Stationsnummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer (Name|Nummer)', 'gauge_elevation': 'PNP'},
    'DE4': {'provider_id': 'Messstellennummer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': 'PNP_Höhe'},
    'DE7': {'provider_id': 'Messstellen Nr.', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Höhe              [m ü. NN]'},
    'DE8': {'provider_id': 'pegelkennzahl', 'gauge_name': 'bezeichnung', 'waterbody_name': 'gewaesser', 'gauge_elevation': 'pnp'},
    'DE9': {'provider_id': 'MESSSTELLE_NR', 'gauge_name': 'LANGNAME', 'waterbody_name': np.nan, 'gauge_elevation': np.nan},
    'DEA': {'provider_id': 'Stationsnummer', 'gauge_name': 'Station', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'NULLPUNKT'},
    'DEB': {'provider_id': 'Nummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DEC': {'provider_id': 'MSTNR', 'gauge_name': 'Pegelname_', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DED': {'provider_id': 'Pegelkennziffer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': np.nan},
    'DEE': {'provider_id': 'SANR', 'gauge_name': 'SNAME', 'waterbody_name': 'SWATER', 'gauge_elevation': np.nan},
    'DEF': {'provider_id': 'id', 'gauge_name': 'gauge', 'waterbody_name': 'river', 'gauge_elevation': np.nan},
    'DEG': {'provider_id': 'Pegelnr', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
}

for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            # read raw metadata for bl
            p = os.path.join(bl.base_path, 'raw_metadata', f"{bl.NUTS}_raw_metadata.csv")
            df = pd.read_csv(p)

            # get relevant metadata columns from lookup dict
            cols = list(_META_DICT[NUTS].values())
            # drop nan from cols
            cols = [col for col in cols if not pd.isna(col)]

            # select relevant columns
            df = df[cols]
            
            # rename columns
            df = df.rename(columns=dict((v,k) for k,v in _META_DICT[NUTS].items()))

            # make provider_id a string
            df['provider_id'] = df['provider_id'].astype(str)

            # transform gauge_elevation to float
            if 'gauge_elevation' in df.columns:
                # make sure that gauge_elevation is a string
                df['gauge_elevation'] = df['gauge_elevation'].astype(str)
                
                # replace comma with dot
                df['gauge_elevation'] = df['gauge_elevation'].str.replace(',', '.')

                # remove all non numeric characters
                df['gauge_elevation'] = df['gauge_elevation'].str.extract('([\d.]+)').astype(float)

                # transform to float
                df['gauge_elevation'] = df['gauge_elevation'].astype(float)
        except FileNotFoundError:
            continue
        # update metadata
        bl.update_metadata(df, id_column='provider_id')

util.get_metadata()

100%|██████████| 16/16 [00:00<00:00, 65.97it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation
0,573000,DEG10000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243
1,447000,DEG10010,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288
2,574200,DEG10020,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577
3,576500,DEG10030,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995
4,570210,DEG10040,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517
...,...,...,...,...,...,...,...,...
3005,76273,DE112470,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871
3006,76274,DE112480,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832
3007,76276,DE112490,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000
3008,76290,DE112500,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098


### Add location

In [4]:
for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            p = os.path.join(bl.base_path, 'locations', f'{bl.NUTS}_Locations.csv')
            # read in 
            df = pd.read_csv(p, dtype={'ID': str})
            df.columns = ['provider_id', 'area', 'x', 'y']
            #df.drop_duplicates(inplace=True)
        except FileNotFoundError:
            continue
        
        # update by simply setting the new metadata to the property setter
        # in this case, the joining column needs to be 'camels_id' or 'provider_id'
        #bl.metadata = df

        # or use the function if you prefer
        bl.update_metadata(df, id_column='provider_id')

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DE9']

100%|██████████| 16/16 [00:00<00:00, 57.21it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y
1369,3183101,DE910000,./DE9/DE910000/DE910000_data.csv,DE9,Niedersachsen,Sudendorf,,,121.560123,4.183280e+06,3.217135e+06
1370,3346103,DE910010,./DE9/DE910010/DE910010_data.csv,DE9,Niedersachsen,Schwege,,,47.371025,4.178458e+06,3.218442e+06
1371,3437108,DE910020,./DE9/DE910020/DE910020_data.csv,DE9,Niedersachsen,Beesten,,,407.147698,4.150721e+06,3.261657e+06
1372,3445100,DE910030,./DE9/DE910030/DE910030_data.csv,DE9,Niedersachsen,Spelle,,,,inf,inf
1373,3449100,DE910040,./DE9/DE910040/DE910040_data.csv,DE9,Niedersachsen,Spelle,,,149.735031,4.155177e+06,3.254250e+06
...,...,...,...,...,...,...,...,...,...,...,...
1658,5934145,DE912890,./DE9/DE912890/DE912890_data.csv,DE9,Niedersachsen,Jeetzel UW,,,,inf,inf
1659,5985101,DE912900,./DE9/DE912900/DE912900_data.csv,DE9,Niedersachsen,Bremervörde,,,,inf,inf
1660,5986107,DE912910,./DE9/DE912910/DE912910_data.csv,DE9,Niedersachsen,Hollen,,,,inf,inf
1661,9286164,DE912920,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06


## Add WGS84 coordinates

In [5]:
# create a transformer
transformer = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# transform
lon, lat = transformer.transform(metadata.x.values, metadata.y.values)

# add back
updates = pd.DataFrame({'camels_id': metadata.camels_id, 'lon': lon, 'lat': lat})
util.update_metadata(new_metadata=updates)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DE9']

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat
1369,DE910000,3183101,./DE9/DE910000/DE910000_data.csv,DE9,Niedersachsen,Sudendorf,,,121.560123,4.183280e+06,3.217135e+06,7.992302,52.047079
1370,DE910010,3346103,./DE9/DE910010/DE910010_data.csv,DE9,Niedersachsen,Schwege,,,47.371025,4.178458e+06,3.218442e+06,7.921498,52.057597
1371,DE910020,3437108,./DE9/DE910020/DE910020_data.csv,DE9,Niedersachsen,Beesten,,,407.147698,4.150721e+06,3.261657e+06,7.495577,52.437915
1372,DE910030,3445100,./DE9/DE910030/DE910030_data.csv,DE9,Niedersachsen,Spelle,,,,inf,inf,inf,inf
1373,DE910040,3449100,./DE9/DE910040/DE910040_data.csv,DE9,Niedersachsen,Spelle,,,149.735031,4.155177e+06,3.254250e+06,7.564736,52.372753
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,DE912890,5934145,./DE9/DE912890/DE912890_data.csv,DE9,Niedersachsen,Jeetzel UW,,,,inf,inf,inf,inf
1659,DE912900,5985101,./DE9/DE912900/DE912900_data.csv,DE9,Niedersachsen,Bremervörde,,,,inf,inf,inf,inf
1660,DE912910,5986107,./DE9/DE912910/DE912910_data.csv,DE9,Niedersachsen,Hollen,,,,inf,inf,inf,inf
1661,DE912920,9286164,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06,6.738424,52.611652


## Count existing data

Go for each file and count the available data. Add anything that makes it necessary to read every single data file into the for-loop below.

Checking columns:

* `'q'` 
* `'w'`

In [6]:
for NUTS in util._NUTS_LVL2_NAMES.keys():
    # empty container for this BL
    count_q = []
    count_w = []
    
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # go for each id
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            # load the data
            try:
                df = bl.get_data(camels_id)
            except FileNotFoundError:
                count_q.append(0)
                count_w.append(0)
                continue
    
            # check q
            if 'q' in df.columns.values:
                count_q.append((~df.q.isna()).count())
            else:
                count_q.append(0)

            # check w
            if 'w' in df.columns.values:
                count_w.append((~df.w.isna()).count())
            else:
                count_w.append(0)

        # build the new metadata
        counts = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_count': np.asarray(count_q, dtype=int), 'w_count': np.asarray(count_w, dtype=int)})

        # add to metadata
        bl.update_metadata(counts)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DE9']

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:04<00:00, 59.15it/s]
DE2: 100%|██████████| 535/535 [00:10<00:00, 49.22it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:03<00:00, 59.57it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:01<00:00, 61.73it/s]
DE8: 100%|██████████| 230/230 [00:01<00:00, 119.42it/s]
DE9: 100%|██████████| 294/294 [00:03<00:00, 95.81it/s] 
DEA: 100%|██████████| 219/219 [00:02<00:00, 78.45it/s]
DEB: 100%|██████████| 124/124 [00:01<00:00, 68.55it/s]
DEC: 100%|██████████| 46/46 [00:00<00:00, 62.08it/s]
DED: 100%|██████████| 282/282 [00:03<00:00, 86.15it/s]
DEE: 100%|██████████| 126/126 [00:02<00:00, 49.75it/s]
DEF: 100%|██████████| 509/509 [00:05<00:00, 100.78it/s]
DEG: 100%|██████████| 63/63 [00:01<00:00, 38.07it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count
1369,DE910000,3183101,./DE9/DE910000/DE910000_data.csv,DE9,Niedersachsen,Sudendorf,,,121.560123,4.183280e+06,3.217135e+06,7.992302,52.047079,12845.0,12845.0
1370,DE910010,3346103,./DE9/DE910010/DE910010_data.csv,DE9,Niedersachsen,Schwege,,,47.371025,4.178458e+06,3.218442e+06,7.921498,52.057597,12845.0,12845.0
1371,DE910020,3437108,./DE9/DE910020/DE910020_data.csv,DE9,Niedersachsen,Beesten,,,407.147698,4.150721e+06,3.261657e+06,7.495577,52.437915,5966.0,5966.0
1372,DE910030,3445100,./DE9/DE910030/DE910030_data.csv,DE9,Niedersachsen,Spelle,,,,inf,inf,inf,inf,6789.0,6789.0
1373,DE910040,3449100,./DE9/DE910040/DE910040_data.csv,DE9,Niedersachsen,Spelle,,,149.735031,4.155177e+06,3.254250e+06,7.564736,52.372753,4597.0,4597.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1658,DE912890,5934145,./DE9/DE912890/DE912890_data.csv,DE9,Niedersachsen,Jeetzel UW,,,,inf,inf,inf,inf,0.0,18689.0
1659,DE912900,5985101,./DE9/DE912900/DE912900_data.csv,DE9,Niedersachsen,Bremervörde,,,,inf,inf,inf,inf,0.0,9923.0
1660,DE912910,5986107,./DE9/DE912910/DE912910_data.csv,DE9,Niedersachsen,Hollen,,,,inf,inf,inf,inf,0.0,12845.0
1661,DE912920,9286164,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06,6.738424,52.611652,0.0,19450.0


## Add extent of available Q and W data in years

In [7]:
# get metadata
meta = util.get_metadata()

# get camels_ids
camels_ids = meta['camels_id'].values

for id in tqdm(camels_ids):
    # init Station
    s = Station(id)

    # get the data
    df = s.get_data()

    # get extent of date index of q and w
    if 'q' in df.columns:
        q_start, q_end = df['q'].dropna().index.min(), df['q'].dropna().index.max()
        
        # compute extent in years
        q_extent = (q_end - q_start).days / 365
        
        # add to metadata
        meta.loc[meta.camels_id == id, 'q_extent_years'] = q_extent
    else:
        meta.loc[meta.camels_id == id, 'q_extent_years'] = np.nan
    
    if 'w' in df.columns:
        w_start, w_end = df['w'].dropna().index.min(), df['w'].dropna().index.max()
        
        # compute extent in years
        w_extent = (w_end - w_start).days / 365
        
        # add to metadata
        meta.loc[meta.camels_id == id, 'w_extent_years'] = w_extent
    else:
        meta.loc[meta.camels_id == id, 'w_extent_years'] = np.nan

# save metadata
meta.to_csv(os.path.join(util.get_output_path(), 'metadata', 'metadata.csv'), index=False)

util.get_metadata().head()

  0%|          | 0/3010 [00:00<?, ?it/s]

100%|██████████| 3010/3010 [01:30<00:00, 33.19it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243,182.7,4352221.0,3124617.0,10.446993,51.231727,29646.0,29646.0,81.219178,32.186301
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288,275.0,4318941.0,3140875.0,9.970428,51.378709,22707.0,22707.0,62.208219,59.876712
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577,174.7,4386764.0,3077926.0,10.933022,50.809106,35490.0,35490.0,97.230137,32.186301
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995,1383.0,4473276.0,3073272.0,12.157989,50.750857,12845.0,12845.0,31.186301,35.189041
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517,1013.0,4442190.0,3033884.0,11.704738,50.404273,21246.0,21246.0,58.205479,52.032877


In [8]:
from glob import glob

# empty_data = []
# empty_files = []

for NUTS in util._NUTS_LVL2_NAMES.keys():

    bl = Bundesland(NUTS)

    print(NUTS)

#     # empty data
#     for id in metadata['camels_id'].values:
#         try:
#             df = bl.get_data(id)
#             if len(df) == 0:
#                 empty_data.append(id)
#         except FileNotFoundError:
#             empty_files.append(id)

    print(f"{len(bl.metadata[(bl.metadata['q_count'] == 0) & (bl.metadata['w_count'] == 0)])}")

    # empty folders
    all_folders = glob(f"{bl.base_path}/{NUTS}/*")

    print("Empty folders:")
    print(f"{len([folder for folder in all_folders if len(os.listdir(folder)) == 0])}\n")

DE1
0
Empty folders:
0

DE2
0
Empty folders:
0

DE3
0
Empty folders:
0

DE4
0
Empty folders:
0

DE5
0
Empty folders:
0

DE6
0
Empty folders:
0

DE7
0
Empty folders:
0

DE8
0
Empty folders:
0

DE9
0
Empty folders:
0

DEA
0
Empty folders:
0

DEB
0
Empty folders:
0

DEC
0
Empty folders:
0

DED
0
Empty folders:
0

DEE
0
Empty folders:
0

DEF
0
Empty folders:
0

DEG
0
Empty folders:
0



In [9]:
# metadata where lon or lat is inf
ids_meta = metadata[(metadata['lon'] == np.inf) | (metadata['lat'] == np.inf)]
ids_meta

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count
69,DEF10060,114300,./DEF/DEF10060/DEF10060_data.csv,DEF,Schleswig-Holstein,Ahrensburg,Aue,,-999.0,inf,inf,inf,inf,699.0,699.0
481,DEF14180,114254,./DEF/DEF14180/DEF14180_data.csv,DEF,Schleswig-Holstein,Sörup,Bönstrupau,,-999.0,inf,inf,inf,inf,577.0,577.0
698,DED10000,551431,./DED/DED10000/DED10000_data.csv,DED,Sachsen,Dippoldiswalde 3,Werkgraben,,,inf,inf,inf,inf,19784.0,19784.0
702,DED10040,564200,./DED/DED10040/DED10040_data.csv,DED,Sachsen,Niedermülsen,Mülsenbach,,,inf,inf,inf,inf,8766.0,8766.0
704,DED10060,576410,./DED/DED10060/DED10060_data.csv,DED,Sachsen,Oelsnitz,Weiße Elster,,,inf,inf,inf,inf,8339.0,8339.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653,DE912840,4781106,./DE9/DE912840/DE912840_data.csv,DE9,Niedersachsen,Wilhelmstein,,,,inf,inf,inf,inf,0.0,17958.0
1656,DE912870,4961130,./DE9/DE912870/DE912870_data.csv,DE9,Niedersachsen,Dümmer-Ost,,,,inf,inf,inf,inf,0.0,21915.0
1658,DE912890,5934145,./DE9/DE912890/DE912890_data.csv,DE9,Niedersachsen,Jeetzel UW,,,,inf,inf,inf,inf,0.0,18689.0
1659,DE912900,5985101,./DE9/DE912900/DE912900_data.csv,DE9,Niedersachsen,Bremervörde,,,,inf,inf,inf,inf,0.0,9923.0
