# Merge Metadata

This script can be used to build the final Metadata file.

There are several notes that are important:

* Some of the location files have been processed externally and the workflow needs to be explained here
* We should note done all CRS transformations applied for reference

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj.transformer import Transformer

from camelsp import Bundesland, Station, util

## Generate basic metadata

This step will produce one metadata file containing all processed data, which can be used as NUTS lookup and as a basis to add more specific metadata.
The first step also loads the Location files and merges everything

### Add Pegelname, Gewässername and elevation

add the above fields to the merged metadata, if we have the information from the Landesämter.

In [30]:
# lookup dictionary for column names in raw_metadata
_META_DICT = {
    'DE1': {'provider_id': 'Messstellennummer', 'gauge_name': 'Standort', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Pegelnullpunkt (PNP) in m'},
    'DE2': {'provider_id': 'Stationsnummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer (Name|Nummer)', 'gauge_elevation': 'PNP'},
    'DE4': {'provider_id': 'Messstellennummer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': 'PNP_Höhe'},
    'DE7': {'provider_id': 'Messstellen Nr.', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Höhe              [m ü. NN]'},
    'DE8': {'provider_id': 'pegelkennzahl', 'gauge_name': 'bezeichnung', 'waterbody_name': 'gewaesser', 'gauge_elevation': 'pnp'},
    'DE9': {'provider_id': 'MESSSTELLE_NR', 'gauge_name': 'LANGNAME', 'waterbody_name': 'GEWAESSER', 'gauge_elevation': np.nan},
    'DEA': {'provider_id': 'Stationsnummer', 'gauge_name': 'Station', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'NULLPUNKT'},
    'DEB': {'provider_id': 'Nummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DEC': {'provider_id': 'MSTNR', 'gauge_name': 'Pegelname_', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DED': {'provider_id': 'Pegelkennziffer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': np.nan},
    'DEE': {'provider_id': 'SANR', 'gauge_name': 'SNAME', 'waterbody_name': 'SWATER', 'gauge_elevation': 'PNP mNN'},
    'DEF': {'provider_id': 'id', 'gauge_name': 'gauge', 'waterbody_name': 'river', 'gauge_elevation': np.nan},
    'DEG': {'provider_id': 'Pegelnr', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
}

for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            # read raw metadata for bl
            p = os.path.join(bl.base_path, 'raw_metadata', f"{bl.NUTS}_raw_metadata.csv")
            df = pd.read_csv(p)

            # get relevant metadata columns from lookup dict
            cols = list(_META_DICT[NUTS].values())
            # drop nan from cols
            cols = [col for col in cols if not pd.isna(col)]

            # select relevant columns
            df = df[cols]
            
            # rename columns
            df = df.rename(columns=dict((v,k) for k,v in _META_DICT[NUTS].items()))

            # make provider_id a string
            df['provider_id'] = df['provider_id'].astype(str)

            # transform gauge_elevation to float
            if 'gauge_elevation' in df.columns:
                # make sure that gauge_elevation is a string
                df['gauge_elevation'] = df['gauge_elevation'].astype(str)
                
                # replace comma with dot
                df['gauge_elevation'] = df['gauge_elevation'].str.replace(',', '.')

                # remove all non numeric characters
                df['gauge_elevation'] = df['gauge_elevation'].str.extract('([\d.]+)').astype(float)

                # transform to float
                df['gauge_elevation'] = df['gauge_elevation'].astype(float)
        except FileNotFoundError:
            continue
        # update metadata
        bl.update_metadata(df, id_column='provider_id')

util.get_metadata()

100%|██████████| 16/16 [00:00<00:00, 36.65it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation
0,440003,DEE10000,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900
1,440004,DEE10010,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240
2,440008,DEE10020,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340
3,440010,DEE10030,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320
4,441201,DEE10040,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970
...,...,...,...,...,...,...,...,...
3005,76273,DE112470,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871
3006,76274,DE112480,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832
3007,76276,DE112490,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000
3008,76290,DE112500,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098


### Add location

In [31]:
for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            p = os.path.join(bl.base_path, 'locations', f'{bl.NUTS}_Locations.csv')
            # read in 
            df = pd.read_csv(p, dtype={'ID': str})
            df.columns = ['provider_id', 'area', 'x', 'y']
            #df.drop_duplicates(inplace=True)
        except FileNotFoundError:
            continue
        
        # update by simply setting the new metadata to the property setter
        # in this case, the joining column needs to be 'camels_id' or 'provider_id'
        #bl.metadata = df

        # or use the function if you prefer
        bl.update_metadata(df, id_column='provider_id')

metadata = util.get_metadata()
metadata

100%|██████████| 16/16 [00:00<00:00, 30.92it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y
0,440003,DEE10000,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900,52.300,4.402365e+06,3.227863e+06
1,440004,DEE10010,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240,142.000,4.398520e+06,3.234256e+06
2,440008,DEE10020,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340,201.000,4.393928e+06,3.241476e+06
3,440010,DEE10030,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320,238.000,4.393036e+06,3.245503e+06
4,441201,DEE10040,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970,23.000,4.396340e+06,3.243827e+06
...,...,...,...,...,...,...,...,...,...,...,...
3005,76273,DE112470,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06
3006,76274,DE112480,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06
3007,76276,DE112490,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06
3008,76290,DE112500,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06


## Add WGS84 coordinates

In [32]:
# create a transformer
transformer = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# transform
lon, lat = transformer.transform(metadata.x.values, metadata.y.values)

# add back
updates = pd.DataFrame({'camels_id': metadata.camels_id, 'lon': lon, 'lat': lat})
util.update_metadata(new_metadata=updates)

metadata = util.get_metadata()
metadata

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat
0,DEE10000,440003,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900,52.300,4.402365e+06,3.227863e+06,11.188884,52.154576
1,DEE10010,440004,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240,142.000,4.398520e+06,3.234256e+06,11.134176,52.212572
2,DEE10020,440008,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340,201.000,4.393928e+06,3.241476e+06,11.068552,52.278070
3,DEE10030,440010,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320,238.000,4.393036e+06,3.245503e+06,11.056336,52.314377
4,DEE10040,441201,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970,23.000,4.396340e+06,3.243827e+06,11.104414,52.298873
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005,DE112470,76273,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06,9.783683,48.416174
3006,DE112480,76274,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06,9.264333,49.170156
3007,DE112490,76276,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06,10.154468,48.630150
3008,DE112500,76290,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06,9.787300,48.036705


## Count existing data

Go for each file and count the available data. Add anything that makes it necessary to read every single data file into the for-loop below.

Checking columns:

* `'q'` 
* `'w'`

In [33]:
for NUTS in util._NUTS_LVL2_NAMES.keys():
    # empty container for this BL
    count_q = []
    count_w = []
    
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # go for each id
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            # load the data
            try:
                df = bl.get_data(camels_id)
            except FileNotFoundError:
                count_q.append(0)
                count_w.append(0)
                continue
    
            # check q
            if 'q' in df.columns.values:
                count_q.append((~df.q.isna()).count())
            else:
                count_q.append(0)

            # check w
            if 'w' in df.columns.values:
                count_w.append((~df.w.isna()).count())
            else:
                count_w.append(0)

        # build the new metadata
        counts = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_count': np.asarray(count_q, dtype=int), 'w_count': np.asarray(count_w, dtype=int)})

        # add to metadata
        bl.update_metadata(counts)

metadata = util.get_metadata()
metadata

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:09<00:00, 27.32it/s]
DE2: 100%|██████████| 535/535 [00:23<00:00, 22.86it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:08<00:00, 26.21it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:03<00:00, 27.17it/s]
DE8: 100%|██████████| 230/230 [00:04<00:00, 48.87it/s]
DE9: 100%|██████████| 294/294 [00:07<00:00, 41.95it/s]
DEA: 100%|██████████| 219/219 [00:06<00:00, 34.07it/s]
DEB: 100%|██████████| 124/124 [00:03<00:00, 37.30it/s]
DEC: 100%|██████████| 46/46 [00:01<00:00, 42.52it/s]
DED: 100%|██████████| 282/282 [00:05<00:00, 50.10it/s]
DEE: 100%|██████████| 126/126 [00:03<00:00, 31.74it/s]
DEF: 100%|██████████| 509/509 [00:08<00:00, 56.62it/s]
DEG: 100%|██████████| 63/63 [00:02<00:00, 21.87it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count
0,DEE10000,440003,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900,52.300,4.402365e+06,3.227863e+06,11.188884,52.154576,10684.0,10684.0
1,DEE10010,440004,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240,142.000,4.398520e+06,3.234256e+06,11.134176,52.212572,28879.0,28879.0
2,DEE10020,440008,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340,201.000,4.393928e+06,3.241476e+06,11.068552,52.278070,18355.0,18355.0
3,DEE10030,440010,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320,238.000,4.393036e+06,3.245503e+06,11.056336,52.314377,28951.0,28951.0
4,DEE10040,441201,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970,23.000,4.396340e+06,3.243827e+06,11.104414,52.298873,15067.0,15067.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005,DE112470,76273,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06,9.783683,48.416174,30987.0,30987.0
3006,DE112480,76274,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06,9.264333,49.170156,4140.0,4140.0
3007,DE112490,76276,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06,10.154468,48.630150,5555.0,5555.0
3008,DE112500,76290,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06,9.787300,48.036705,8342.0,8342.0


## Add extent of available Q and W data in years

In [34]:
# get metadata
meta = util.get_metadata()

# get camels_ids
camels_ids = meta['camels_id'].values

for id in tqdm(camels_ids):
    # init Station
    s = Station(id)

    # get the data
    df = s.get_data()

    # get extent of date index of q and w
    if 'q' in df.columns:
        q_start, q_end = df['q'].dropna().index.min(), df['q'].dropna().index.max()
        
        # compute extent in years
        q_extent = (q_end - q_start).days / 365
        
        # add to metadata
        meta.loc[meta.camels_id == id, 'q_extent_years'] = q_extent
    else:
        meta.loc[meta.camels_id == id, 'q_extent_years'] = np.nan
    
    if 'w' in df.columns:
        w_start, w_end = df['w'].dropna().index.min(), df['w'].dropna().index.max()
        
        # compute extent in years
        w_extent = (w_end - w_start).days / 365
        
        # add to metadata
        meta.loc[meta.camels_id == id, 'w_extent_years'] = w_extent
    else:
        meta.loc[meta.camels_id == id, 'w_extent_years'] = np.nan

# save metadata
meta.to_csv(os.path.join(util.get_output_path(), 'metadata', 'metadata.csv'), index=False)

util.get_metadata().head()

  0%|          | 0/3010 [00:00<?, ?it/s]

100%|██████████| 3010/3010 [02:48<00:00, 17.86it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years
0,DEE10000,440003,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.9,52.3,4402365.0,3227863.0,11.188884,52.154576,10684.0,10684.0,29.268493,9.389041
1,DEE10010,440004,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.24,142.0,4398520.0,3234256.0,11.134176,52.212572,28879.0,28879.0,51.361644,79.117808
2,DEE10020,440008,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.34,201.0,4393928.0,3241476.0,11.068552,52.27807,18355.0,18355.0,48.282192,50.284932
3,DEE10030,440010,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.32,238.0,4393036.0,3245503.0,11.056336,52.314377,28951.0,28951.0,51.284932,79.315068
4,DEE10040,441201,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.97,23.0,4396340.0,3243827.0,11.104414,52.298873,15067.0,15067.0,29.268493,41.276712
