# Merge Metadata

This script can be used to build the final Metadata file.

There are several notes that are important:

* Some of the location files have been processed externally and the workflow needs to be explained here
* We should note done all CRS transformations applied for reference

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj.transformer import Transformer

from camelsp import Bundesland, util

  def hasna(x: np.ndarray) -> bool:


As an example: The `Bundesland` context manager can load the metadata for the given Bundesland only from the full metadata table. If this table does not yet exist, it is created from the NUTSID mapping table. Check out for Saarland:

In [2]:
with Bundesland('DEB') as bl:
    dec_meta = bl.metadata

dec_meta

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
3167,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642,15388.0,15388.0,0.963048,0.980030
3168,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891,20907.0,20907.0,0.916515,0.900342
3169,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918,25048.0,25048.0,0.906273,0.813526
3170,DEB10030,2546052200,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,34.5,4.131066e+06,2.926551e+06,7.382015,49.421427,0.0,0.0,,
3171,DEB10040,2546057700,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,19.44,4.151160e+06,2.931848e+06,7.656510,49.475124,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,DEB11500,2716025700,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3318,DEB11510,2716050800,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,0.0,0.0,,
3319,DEB11520,2716055200,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3320,DEB11530,2628036600,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,0.0,0.0,,


## Generate basic metadata

This step will produce one metadata file containing all processed data, which can be used as NUTS lookup and as a basis to add more specific metadata.
The first step also loads the Location files and merges everything

### Add Pegelname, Gewässername and elevation

add the above fields to the merged metadata, if we have the information from the Landesämter.

In [84]:
bl = Bundesland('DEG')
p = os.path.join(bl.base_path, 'raw_metadata', f"{bl.NUTS}_raw_metadata.csv")

pd.read_csv(p)

Unnamed: 0,Pegelnr,Pegelname,Gewässer,Lage o. M.,EZG,PNP,Höhensystem,HW (GK 4),RW (GK 4),lon,lat,NNQ,Datum NNQ,HHQ,Datum HHQ,unit_q,unit_w
0,573000,Ammern,Unstrut,161.2,182.7,210.243,NH,5676589,601026,10.4470,51.2317,0.130,OFT,115.0,am 04.06.1981,m³/s,cm
1,447000,Arenshausen,Leine,247.1,275.0,196.288,NH,5692387,567538,9.9704,51.3787,0.260,am 09.09.2010,92.8,am 04.06.1981,m³/s,cm
2,574200,Arnstadt,Gera,45.2,174.7,293.577,NH,5630378,636190,10.9330,50.8091,0.210,OFT,75.7,am 10.08.1981,m³/s,cm
3,576500,Berga,Weiße Elster,151.0,1383.0,218.995,NH,5626876,722757,12.1580,50.7509,,,,,m³/s,cm
4,570210,Blankenstein-Rosenthal,Saale,357.0,1013.0,410.517,NH,5587078,692197,11.7047,50.4043,0.306,am 10.07.1976,251.0,am 05.01.1982,m³/s,cm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,427010,Unterbreizbach-Räsa,Ulster,5.0,399.0,233.323,NH,5628816,568818,9.9767,50.8070,0.180,OFT,218.0,am 04.06.1981,m³/s,cm
59,420120,Vacha,Werra,164.8,2246.0,222.678,NH,5631886,573776,10.0477,50.8340,1.550,am 05.10.1959,321.0,am 10.02.1946,m³/s,cm
60,575110,Wasserthaleben,Helbe,19.0,374.3,174.317,NH,5680112,631983,10.8915,51.2571,0.100,OFT,64.9,am 30.12.2002,m³/s,cm
61,577320,Weida,Weida,7.0,296.7,238.358,NH,5627781,715938,12.0620,50.7616,0.000,OFT,139.0,am 15.08.1924,m³/s,cm


In [117]:
# lookup dictionary for column names in raw_metadata
_META_DICT = {
    'DE1': {'provider_id': 'Messstellennummer', 'gauge_name': 'Standort', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Pegelnullpunkt (PNP) in m'},
    'DE2': {'provider_id': 'Stationsnummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer (Name|Nummer)', 'gauge_elevation': 'PNP'},
    'DE4': {'provider_id': 'Messstellennummer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': 'PNP_Höhe'},
    'DE7': {'provider_id': 'Messstellen Nr.', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'Höhe              [m ü. NN]'},
    'DE8': {'provider_id': 'pegelkennzahl', 'gauge_name': 'bezeichnung', 'waterbody_name': 'gewaesser', 'gauge_elevation': 'pnp'},
    'DE9': {'provider_id': 'MESSSTELLE_NR', 'gauge_name': 'LANGNAME', 'waterbody_name': np.nan, 'gauge_elevation': np.nan},
    'DEA': {'provider_id': 'Stationsnummer', 'gauge_name': 'Station', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'NULLPUNKT'},
    'DEB': {'provider_id': 'Nummer', 'gauge_name': 'Stationsname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DEC': {'provider_id': 'MSTNR', 'gauge_name': 'Pegelname_', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
    'DED': {'provider_id': 'Pegelkennziffer', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewaesser', 'gauge_elevation': np.nan},
    'DEE': {'provider_id': 'SANR', 'gauge_name': 'SNAME', 'waterbody_name': 'SWATER', 'gauge_elevation': np.nan},
    'DEF': {'provider_id': 'id', 'gauge_name': 'gauge', 'waterbody_name': 'river', 'gauge_elevation': np.nan},
    'DEG': {'provider_id': 'Pegelnr', 'gauge_name': 'Pegelname', 'waterbody_name': 'Gewässer', 'gauge_elevation': 'PNP'},
}

for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            # read raw metadata for bl
            p = os.path.join(bl.base_path, 'raw_metadata', f"{bl.NUTS}_raw_metadata.csv")
            df = pd.read_csv(p)

            # get relevant metadata columns from lookup dict
            cols = list(_META_DICT[NUTS].values())
            # drop nan from cols
            cols = [col for col in cols if not pd.isna(col)]

            # select relevant columns
            df = df[cols]
            
            # rename columns
            df = df.rename(columns=dict((v,k) for k,v in _META_DICT[NUTS].items()))

            # make provider_id a string
            df['provider_id'] = df['provider_id'].astype(str)

            # transform gauge_elevation to float
            if 'gauge_elevation' in df.columns:
                # make sure that gauge_elevation is a string
                df['gauge_elevation'] = df['gauge_elevation'].astype(str)
                
                # replace comma with dot
                df['gauge_elevation'] = df['gauge_elevation'].str.replace(',', '.')

                # remove all non numeric characters
                df['gauge_elevation'] = df['gauge_elevation'].str.extract('([\d.]+)').astype(float)

                # transform to float
                df['gauge_elevation'] = df['gauge_elevation'].astype(float)
        except FileNotFoundError:
            continue
        
        # update metadata
        bl.update_metadata(df, id_column='provider_id')

util.get_metadata()

100%|██████████| 16/16 [00:00<00:00, 35.14it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman,gauge_name,waterbody_name,gauge_elevation
0,5860200,DE410000,./DE4/DE410000/DE410000_data.csv,DE4,Brandenburg,68.88,4.583855e+06,3.281197e+06,13.879427,52.576841,0.0,18930.0,,,"Strausberg, Fähre",Straussee,64.138
1,5873101,DE410010,./DE4/DE410010/DE410010_data.csv,DE4,Brandenburg,226.69,4.507876e+06,3.232843e+06,12.732362,52.173797,21021.0,21021.0,0.699270,0.653806,Trebitz,Plane,47.397
2,5896202,DE410020,./DE4/DE410020/DE410020_data.csv,DE4,Brandenburg,211.16,4.487561e+06,3.342248e+06,12.490668,53.162802,14597.0,14597.0,0.889058,0.872906,Wittstock UP,Dosse,60.492
3,6602800,DE410030,./DE4/DE410030/DE410030_data.csv,DE4,Brandenburg,4064.98,4.644224e+06,3.217008e+06,14.706919,51.969154,22998.0,22998.0,0.868307,0.897783,Guben 2,Lausitzer Neiße,37.474
4,5856800,DE410040,./DE4/DE410040/DE410040_data.csv,DE4,Brandenburg,105.11,4.560622e+06,3.245010e+06,13.511195,52.262716,0.0,17059.0,,,"Mittenwalde, Verteilerwehr OP",Zülow-Kanal,34.055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,2716025700,DEB11500,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,,Steinshof,Wied,
3318,2716050800,DEB11510,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,0.0,0.0,,,Brückrachdorf,Holzbach,
3319,2716055200,DEB11520,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,,Dierdorf,Holzbach,
3320,2628036600,DEB11530,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,0.0,0.0,,,Bitburg Stausee,"Prüm, Stausee Bitburg",


In [116]:
meta = pd.read_csv('../output_data/metadata/metadata.csv')

# drop column id
meta = meta.drop(columns=['id'])
meta.to_csv('../output_data/metadata/metadata.csv', index=False)

In [3]:
for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            p = os.path.join(bl.base_path, 'locations', f'{bl.NUTS}_Locations.csv')
            # read in 
            df = pd.read_csv(p, dtype={'ID': str})
            df.columns = ['provider_id', 'area', 'x', 'y']
            #df.drop_duplicates(inplace=True)
        except FileNotFoundError:
            continue
        
        # update by simply setting the new metadata to the property setter
        # in this case, the joining column needs to be 'camels_id' or 'provider_id'
        #bl.metadata = df

        # or use the function if you prefer
        bl.update_metadata(df, id_column='provider_id')

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
3167,2546015800,DEB10000,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,200.94,4.135583e+06,2.929086e+06,,,,,,
3168,2546030700,DEB10010,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,598.31,4.139014e+06,2.946132e+06,,,,,,
3169,2546040900,DEB10020,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,1088.17,4.151599e+06,2.955622e+06,,,,,,
3170,2546052200,DEB10030,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,34.5,4.131066e+06,2.926551e+06,,,,,,
3171,2546057700,DEB10040,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,19.44,4.151160e+06,2.931848e+06,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,2716025700,DEB11500,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,,,,,,
3318,2716050800,DEB11510,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,,,,,,
3319,2716055200,DEB11520,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,,,,,,
3320,2628036600,DEB11530,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,,,,,,


## Add WGS84 coordinates

In [4]:
# create a transformer
transformer = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# transform
lon, lat = transformer.transform(metadata.x.values, metadata.y.values)

# add back
updates = pd.DataFrame({'camels_id': metadata.camels_id, 'lon': lon, 'lat': lat})
util.update_metadata(new_metadata=updates)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
3167,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642,,,,
3168,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891,,,,
3169,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918,,,,
3170,DEB10030,2546052200,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,34.5,4.131066e+06,2.926551e+06,7.382015,49.421427,,,,
3171,DEB10040,2546057700,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,19.44,4.151160e+06,2.931848e+06,7.656510,49.475124,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,DEB11500,2716025700,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,,,,
3318,DEB11510,2716050800,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,,,,
3319,DEB11520,2716055200,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,,,,
3320,DEB11530,2628036600,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,,,,


## Count existing data

Go for each file and count the available data. Add anything that makes it necessary to read every single data file into the for-loop below.

Checking columns:

* `'q'` 
* `'w'`

In [5]:
for NUTS in util._NUTS_LVL2_NAMES.keys():
    # empty container for this BL
    count_q = []
    count_w = []
    
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # go for each id
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            # load the data
            try:
                df = bl.get_data(camels_id)
            except FileNotFoundError:
                count_q.append(0)
                count_w.append(0)
                continue
    
            # check q
            if 'q' in df.columns.values:
                count_q.append((~df.q.isna()).count())
            else:
                count_q.append(0)

            # check w
            if 'w' in df.columns.values:
                count_w.append((~df.w.isna()).count())
            else:
                count_w.append(0)

        # build the new metadata
        counts = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_count': np.asarray(count_q, dtype=int), 'w_count': np.asarray(count_w, dtype=int)})

        # add to metadata
        bl.update_metadata(counts)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

DE1:   0%|          | 0/259 [00:00<?, ?it/s]

DE1: 100%|██████████| 259/259 [00:05<00:00, 51.32it/s]
DE2: 100%|██████████| 540/540 [00:13<00:00, 38.91it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:05<00:00, 44.24it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:02<00:00, 40.88it/s]
DE8: 100%|██████████| 235/235 [00:03<00:00, 77.19it/s]
DE9: 100%|██████████| 282/282 [00:03<00:00, 79.87it/s]
DEA: 100%|██████████| 219/219 [00:04<00:00, 47.47it/s]
DEB: 100%|██████████| 155/155 [00:02<00:00, 58.99it/s]
DEC: 100%|██████████| 56/56 [00:00<00:00, 64.73it/s]
DED: 100%|██████████| 282/282 [00:05<00:00, 53.50it/s]
DEE: 100%|██████████| 126/126 [00:02<00:00, 42.34it/s]
DEF: 100%|██████████| 775/775 [00:06<00:00, 121.66it/s]
DEG: 100%|██████████| 63/63 [00:01<00:00, 36.03it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
3167,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642,15388.0,15388.0,,
3168,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891,20907.0,20907.0,,
3169,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918,25048.0,25048.0,,
3170,DEB10030,2546052200,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,34.5,4.131066e+06,2.926551e+06,7.382015,49.421427,0.0,0.0,,
3171,DEB10040,2546057700,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,19.44,4.151160e+06,2.931848e+06,7.656510,49.475124,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,DEB11500,2716025700,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3318,DEB11510,2716050800,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,0.0,0.0,,
3319,DEB11520,2716055200,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3320,DEB11530,2628036600,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,0.0,0.0,,


## Add W ~ Q correlations 

The data reports contain correlations between all data variables. We can extract the pearson's correlation coefficient or the spearman rank correlation and add to the preliminary metadata file

In [6]:
util.get_metadata()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
0,DE410000,5860200,./DE4/DE410000/DE410000_data.csv,DE4,Brandenburg,68.88,4.583855e+06,3.281197e+06,13.879427,52.576841,0.0,18930.0,,
1,DE410010,5873101,./DE4/DE410010/DE410010_data.csv,DE4,Brandenburg,226.69,4.507876e+06,3.232843e+06,12.732362,52.173797,21021.0,21021.0,0.699270,0.653806
2,DE410020,5896202,./DE4/DE410020/DE410020_data.csv,DE4,Brandenburg,211.16,4.487561e+06,3.342248e+06,12.490668,53.162802,14597.0,14597.0,0.889058,0.872906
3,DE410030,6602800,./DE4/DE410030/DE410030_data.csv,DE4,Brandenburg,4064.98,4.644224e+06,3.217008e+06,14.706919,51.969154,22998.0,22998.0,0.868307,0.897783
4,DE410040,5856800,./DE4/DE410040/DE410040_data.csv,DE4,Brandenburg,105.11,4.560622e+06,3.245010e+06,13.511195,52.262716,0.0,17059.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,DEB11500,2716025700,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3318,DEB11510,2716050800,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,0.0,0.0,,
3319,DEB11520,2716055200,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3320,DEB11530,2628036600,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,0.0,0.0,,


In [7]:
for NUTS in util._NUTS_LVL2_NAMES.keys():    
    # process this federal state
    with Bundesland(NUTS) as bl:    
        pearson = []
        spearman = []

        # get the metadata
        meta = bl.metadata

        # load the Data-report for each
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            p = os.path.join(bl.base_path, 'reports', f'{camels_id}.json')
            
            # check if the report exists
            if not os.path.exists(p):
                pearson.append(None)
                spearman.append(None)
                continue
            
            with open(p, 'r') as f:
                report = json.load(f)
            
            try:
                q = [o for o in report['correlations']['pearson'] if o['q'] == 1.0][0]
                w = [o for o in report['correlations']['pearson'] if o['w'] == 1.0][0]
                pearson.append(q['w'])
            except:
                pearson.append(None)
            
            try:
                q = [o for o in report['correlations']['spearman'] if o['q'] == 1.0][0]
                w = [o for o in report['correlations']['spearman'] if o['w'] == 1.0][0]
                spearman.append(q['w'])
            except:
                spearman.append(None)
        
        # all collected, return now
        corrs = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_w_pearson': pearson, 'q_w_spearman': spearman})

        # update
        bl.update_metadata(corrs)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DEB']

DE1:   0%|          | 0/259 [00:00<?, ?it/s]

DE1: 100%|██████████| 259/259 [00:02<00:00, 108.87it/s]
DE2: 100%|██████████| 540/540 [00:06<00:00, 80.69it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:02<00:00, 115.87it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:01<00:00, 94.27it/s] 
DE8: 100%|██████████| 235/235 [00:01<00:00, 161.45it/s]
DE9: 100%|██████████| 282/282 [00:02<00:00, 129.06it/s]
DEA: 100%|██████████| 219/219 [00:03<00:00, 63.25it/s]
DEB: 100%|██████████| 155/155 [00:01<00:00, 127.08it/s]
DEC: 100%|██████████| 56/56 [00:00<00:00, 129.44it/s]
DED: 100%|██████████| 282/282 [00:01<00:00, 143.98it/s]
DEE: 100%|██████████| 126/126 [00:01<00:00, 85.33it/s]
DEF: 100%|██████████| 775/775 [00:02<00:00, 307.31it/s]
DEG: 100%|██████████| 63/63 [00:00<00:00, 73.80it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
3167,DEB10000,2546015800,./DEB/DEB10000/DEB10000_data.csv,DEB,Rheinland-Pfalz,200.94,4.135583e+06,2.929086e+06,7.443029,49.445642,15388.0,15388.0,0.963048,0.980030
3168,DEB10010,2546030700,./DEB/DEB10010/DEB10010_data.csv,DEB,Rheinland-Pfalz,598.31,4.139014e+06,2.946132e+06,7.482398,49.599891,20907.0,20907.0,0.916515,0.900342
3169,DEB10020,2546040900,./DEB/DEB10020/DEB10020_data.csv,DEB,Rheinland-Pfalz,1088.17,4.151599e+06,2.955622e+06,7.652250,49.688918,25048.0,25048.0,0.906273,0.813526
3170,DEB10030,2546052200,./DEB/DEB10030/DEB10030_data.csv,DEB,Rheinland-Pfalz,34.5,4.131066e+06,2.926551e+06,7.382015,49.421427,0.0,0.0,,
3171,DEB10040,2546057700,./DEB/DEB10040/DEB10040_data.csv,DEB,Rheinland-Pfalz,19.44,4.151160e+06,2.931848e+06,7.656510,49.475124,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,DEB11500,2716025700,./DEB/DEB11500/DEB11500_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3318,DEB11510,2716050800,./DEB/DEB11510/DEB11510_data.csv,DEB,Rheinland-Pfalz,71.8,4.157039e+06,3.050256e+06,7.686723,50.540852,0.0,0.0,,
3319,DEB11520,2716055200,./DEB/DEB11520/DEB11520_data.csv,DEB,Rheinland-Pfalz,,inf,inf,inf,inf,0.0,0.0,,
3320,DEB11530,2628036600,./DEB/DEB11530/DEB11530_data.csv,DEB,Rheinland-Pfalz,,1.910371e+06,3.122243e+06,-22.462095,46.346123,0.0,0.0,,


In [16]:
from glob import glob

# empty_data = []
# empty_files = []

for NUTS in util._NUTS_LVL2_NAMES.keys():

    bl = Bundesland(NUTS)

    print(NUTS)

#     # empty data
#     for id in metadata['camels_id'].values:
#         try:
#             df = bl.get_data(id)
#             if len(df) == 0:
#                 empty_data.append(id)
#         except FileNotFoundError:
#             empty_files.append(id)

    print(f"{len(bl.metadata[(bl.metadata['q_count'] == 0) & (bl.metadata['w_count'] == 0)])}")

    # empty folders
    all_folders = glob(f"{bl.base_path}/{NUTS}/*")

    print("Empty folders:")
    print(f"{len([folder for folder in all_folders if len(os.listdir(folder)) == 0])}\n")

DE1
7
Empty folders:
0

DE2
5
Empty folders:
0

DE3
0
Empty folders:
0

DE4
0
Empty folders:
0

DE5
0
Empty folders:
0

DE6
0
Empty folders:
0

DE7
0
Empty folders:
0

DE8
5
Empty folders:
0

DE9
0
Empty folders:
0

DEA
0
Empty folders:
0

DEB
31
Empty folders:
0

DEC
10
Empty folders:
0

DED
0
Empty folders:
0

DEE
0
Empty folders:
0

DEF
266
Empty folders:
170

DEG
0
Empty folders:
0



In [11]:
len(Bundesland('DE4').get_data('DE410000'))

18930

In [13]:
bl.get_data('DE410000')

AttributeError: 'DataFrame' object has no attribute 'provider_id'

In [9]:
empty_files

['DE410000',
 'DE410010',
 'DE410020',
 'DE410030',
 'DE410040',
 'DE410050',
 'DE410060',
 'DE410070',
 'DE410080',
 'DE410090',
 'DE410100',
 'DE410110',
 'DE410120',
 'DE410130',
 'DE410140',
 'DE410150',
 'DE410160',
 'DE410170',
 'DE410180',
 'DE410190',
 'DE410200',
 'DE410210',
 'DE410220',
 'DE410230',
 'DE410240',
 'DE410250',
 'DE410260',
 'DE410270',
 'DE410280',
 'DE410290',
 'DE410300',
 'DE410310',
 'DE410320',
 'DE410330',
 'DE410340',
 'DE410350',
 'DE410360',
 'DE410370',
 'DE410380',
 'DE410390',
 'DE410400',
 'DE410410',
 'DE410420',
 'DE410430',
 'DE410440',
 'DE410450',
 'DE410460',
 'DE410470',
 'DE410480',
 'DE410490',
 'DE410500',
 'DE410510',
 'DE410520',
 'DE410530',
 'DE410540',
 'DE410550',
 'DE410560',
 'DE410570',
 'DE410580',
 'DE410590',
 'DE410600',
 'DE410610',
 'DE410620',
 'DE410630',
 'DE410640',
 'DE410650',
 'DE410660',
 'DE410670',
 'DE410680',
 'DE410690',
 'DE410700',
 'DE410710',
 'DE410720',
 'DE410730',
 'DE410740',
 'DE410750',
 'DE410760',