# Merge Metadata

This script can be used to build the final Metadata file.

There are several notes that are important:

* Some of the location files have been processed externally and the workflow needs to be explained here
* We should note done all CRS transformations applied for reference

In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj.transformer import Transformer

from camelsp import Bundesland, util

As an example: The `Bundesland` context manager can load the metadata for the given Bundesland only from the full metadata table. If this table does not yet exist, it is created from the NUTSID mapping table. Check out for Saarland:

In [2]:
with Bundesland('DE8') as bl:
    dec_meta = bl.metadata

dec_meta.head()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state
2529,DE810000,4341.0,./DE8/DE810000/DE810000_data.csv,DE8,Mecklenburg Vorpommern
2530,DE810010,4341.1,./DE8/DE810010/DE810010_data.csv,DE8,Mecklenburg Vorpommern
2531,DE810020,4342.0,./DE8/DE810020/DE810020_data.csv,DE8,Mecklenburg Vorpommern
2532,DE810030,4343.0,./DE8/DE810030/DE810030_data.csv,DE8,Mecklenburg Vorpommern
2533,DE810040,4344.0,./DE8/DE810040/DE810040_data.csv,DE8,Mecklenburg Vorpommern


## Generate basic metadata

This step will produce one metadata file containing all processed data, which can be used as NUTS lookup and as a basis to add more specific metadata.
The first step also loads the Location files and merges everything

In [3]:
for NUTS in tqdm(util._NUTS_LVL2_NAMES.keys()):
    with Bundesland(NUTS) as bl:
        try:
            p = os.path.join(bl.base_path, 'locations', f'{bl.NUTS}_Locations.csv')
            # read in 
            df = pd.read_csv(p)
            df.columns = ['provider_id', 'area', 'x', 'y']
            #df.drop_duplicates(inplace=True)
        except FileNotFoundError:
            continue
        
        # update by simply setting the new metadata to the property setter
        # in this case, the joining column needs to be 'camels_id' or 'provider_id'
        #bl.metadata = df

        # or use the function if you prefer
        bl.update_metadata(df, id_column='provider_id')

metadata = util.get_metadata()
metadata

100%|██████████| 16/16 [00:00<00:00, 36.23it/s]


Unnamed: 0,provider_id,camels_id,camels_path,nuts_lvl2,federal_state,area,x,y
0,165666.0,DE410000,./DE4/DE410000/DE410000_data.csv,DE4,Brandenburg,533.24,4.511477e+06,3.320481e+06
1,278150.0,DE410010,./DE4/DE410010/DE410010_data.csv,DE4,Brandenburg,118.21,4.573204e+06,3.279801e+06
2,166379.0,DE410020,./DE4/DE410020/DE410020_data.csv,DE4,Brandenburg,1792.07,4.533215e+06,3.254909e+06
3,166369.0,DE410030,./DE4/DE410030/DE410030_data.csv,DE4,Brandenburg,3154.03,4.556514e+06,3.161963e+06
4,165980.0,DE410040,./DE4/DE410040/DE410040_data.csv,DE4,Brandenburg,284.62,4.451184e+06,3.318162e+06
...,...,...,...,...,...,...,...,...
3655,56113404.0,DE215350,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.20,4.456659e+06,3.009715e+06
3656,56114000.0,DE215360,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.10,4.457008e+06,3.011665e+06
3657,56122008.0,DE215370,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.30,4.465052e+06,3.016793e+06
3658,56143008.0,DE215380,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.40,4.462157e+06,3.021202e+06


## Add WGS84 coordinates

In [4]:
# create a transformer
transformer = Transformer.from_crs("EPSG:3035", "EPSG:4326", always_xy=True)

# transform
lon, lat = transformer.transform(metadata.x.values, metadata.y.values)

# add back
updates = pd.DataFrame({'camels_id': metadata.camels_id, 'lon': lon, 'lat': lat})
util.update_metadata(new_metadata=updates)

metadata = util.get_metadata()
metadata

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat
0,DE410000,165666.0,./DE4/DE410000/DE410000_data.csv,DE4,Brandenburg,533.24,4.511477e+06,3.320481e+06,12.835157,52.959469
1,DE410010,278150.0,./DE4/DE410010/DE410010_data.csv,DE4,Brandenburg,118.21,4.573204e+06,3.279801e+06,13.721427,52.569315
2,DE410020,166379.0,./DE4/DE410020/DE410020_data.csv,DE4,Brandenburg,1792.07,4.533215e+06,3.254909e+06,13.116304,52.362773
3,DE410030,166369.0,./DE4/DE410030/DE410030_data.csv,DE4,Brandenburg,3154.03,4.556514e+06,3.161963e+06,13.394420,51.519043
4,DE410040,165980.0,./DE4/DE410040/DE410040_data.csv,DE4,Brandenburg,284.62,4.451184e+06,3.318162e+06,11.937284,52.956437
...,...,...,...,...,...,...,...,...,...,...
3655,DE215350,56113404.0,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.20,4.456659e+06,3.009715e+06,11.899471,50.183855
3656,DE215360,56114000.0,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.10,4.457008e+06,3.011665e+06,11.905065,50.201292
3657,DE215370,56122008.0,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.30,4.465052e+06,3.016793e+06,12.019640,50.245443
3658,DE215380,56143008.0,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.40,4.462157e+06,3.021202e+06,11.980721,50.285782


## Count existing data

Go for each file and count the available data. Add anything that makes it necessary to read every single data file into the for-loop below.

Checking columns:

* `'q'` 
* `'w'`

In [21]:
for NUTS in util._NUTS_LVL2_NAMES.keys():
    # empty container for this BL
    count_q = []
    count_w = []
    
    # process this federal state
    with Bundesland(NUTS) as bl:
        # get meta
        meta = bl.metadata

        # go for each id
        for camels_id in tqdm(meta.camels_id.values):
            # load the data
            try:
                df = bl.get_data(camels_id)
            except FileNotFoundError:
                count_q.append(0)
                count_w.append(0)
                continue
    
            # check q
            if 'q' in df.columns.values:
                count_q.append((~df.q.isna()).count())
            else:
                count_q.append(0)

            # check w
            if 'w' in df.columns.values:
                count_w.append((~df.w.isna()).count())
            else:
                count_w.append(0)

        # build the new metadata
        counts = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_count': np.asarray(count_q, dtype=int), 'w_count': np.asarray(count_w, dtype=int)})

        # add to metadata
        bl.update_metadata(counts)

metadata = util.get_metadata()
metadata

100%|██████████| 259/259 [00:04<00:00, 60.62it/s]
100%|██████████| 540/540 [00:10<00:00, 52.07it/s]
0it [00:00, ?it/s]
  df = pd.read_csv(path, parse_dates=['date'])
100%|██████████| 382/382 [00:05<00:00, 74.95it/s] 
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 97/97 [00:01<00:00, 51.38it/s]
100%|██████████| 235/235 [00:03<00:00, 78.25it/s]
100%|██████████| 282/282 [00:03<00:00, 70.60it/s]
100%|██████████| 437/437 [00:05<00:00, 84.27it/s] 
0it [00:00, ?it/s]
100%|██████████| 56/56 [00:00<00:00, 65.54it/s]
100%|██████████| 282/282 [00:03<00:00, 73.59it/s]
100%|██████████| 252/252 [00:03<00:00, 73.56it/s] 
100%|██████████| 775/775 [00:06<00:00, 124.85it/s]
100%|██████████| 63/63 [00:01<00:00, 42.84it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,q_count,w_count
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,182.7,4.352221e+06,3124617.000,29646.0,29646.0
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,275.0,4.318941e+06,3140875.000,0.0,0.0
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,174.7,4.386764e+06,3077926.000,35490.0,35490.0
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,1383.0,4.473276e+06,3073272.000,12845.0,12845.0
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,1013.0,4.442190e+06,3033884.000,21246.0,21246.0
...,...,...,...,...,...,...,...,...,...,...
3655,DE215350,56113404,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.2,4.456659e+06,3009715.476,16497.0,16497.0
3656,DE215360,56114000,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.1,4.457008e+06,3011664.641,20880.0,20880.0
3657,DE215370,56122008,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.3,4.465052e+06,3016792.655,23072.0,23072.0
3658,DE215380,56143008,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.4,4.462157e+06,3021202.262,23440.0,23440.0


## Add W ~ Q correlations 

The data reports contain correlations between all data variables. We can extract the pearson's correlation coefficient or the spearman rank correlation and add to the preliminary metadata file

In [6]:
util.get_metadata()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,q_count,w_count,q_w_pearson,q_w_spearman
0,DEG10000,57300.0,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,,,,29646.0,29646.0,0.969240,0.976895
1,DEG10010,44700.0,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,,,,22707.0,22707.0,,
2,DEG10020,57420.0,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,,,,35490.0,35490.0,0.958767,0.962387
3,DEG10030,57650.0,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,,,,12845.0,12845.0,0.502141,0.553354
4,DEG10040,57021.0,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,,,,21246.0,21246.0,0.940139,0.977699
...,...,...,...,...,...,...,...,...,...,...,...,...
3655,DE215350,56113404,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.2,4.456659e+06,3009715.476,16497.0,16497.0,0.558268,0.933178
3656,DE215360,56114000,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.1,4.457008e+06,3011664.641,20880.0,20880.0,0.996458,0.985672
3657,DE215370,56122008,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.3,4.465052e+06,3016792.655,23072.0,23072.0,0.399310,0.898170
3658,DE215380,56143008,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.4,4.462157e+06,3021202.262,23440.0,23440.0,0.547284,0.871654


In [5]:
for NUTS in util._NUTS_LVL2_NAMES.keys():    
    # process this federal state
    with Bundesland(NUTS) as bl:    
        pearson = []
        spearman = []

        # get the metadata
        meta = bl.metadata

        # load the Data-report for each
        for camels_id in tqdm(meta.camels_id.values):
            p = os.path.join(bl.base_path, 'reports', f'{camels_id}.json')
            
            # check if the report exists
            if not os.path.exists(p):
                pearson.append(None)
                spearman.append(None)
                continue
            
            with open(p, 'r') as f:
                report = json.load(f)
            
            try:
                q = [o for o in report['correlations']['pearson'] if o['q'] == 1.0][0]
                w = [o for o in report['correlations']['pearson'] if o['w'] == 1.0][0]
                pearson.append(q['w'])
            except:
                pearson.append(None)
            
            try:
                q = [o for o in report['correlations']['spearman'] if o['q'] == 1.0][0]
                w = [o for o in report['correlations']['spearman'] if o['w'] == 1.0][0]
                spearman.append(q['w'])
            except:
                spearman.append(None)
        
        # all collected, return now
        corrs = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_w_pearson': pearson, 'q_w_spearman': spearman})

        # update
        bl.update_metadata(corrs)

metadata = util.get_metadata()
metadata

100%|██████████| 259/259 [00:13<00:00, 19.47it/s]
100%|██████████| 540/540 [00:39<00:00, 13.56it/s]
0it [00:00, ?it/s]
100%|██████████| 382/382 [00:14<00:00, 25.86it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 97/97 [00:06<00:00, 15.08it/s]
100%|██████████| 235/235 [00:10<00:00, 22.96it/s]
100%|██████████| 282/282 [00:13<00:00, 20.61it/s]
100%|██████████| 437/437 [00:16<00:00, 26.90it/s]
0it [00:00, ?it/s]
100%|██████████| 56/56 [00:02<00:00, 20.75it/s]
100%|██████████| 282/282 [00:12<00:00, 22.21it/s]
100%|██████████| 252/252 [00:08<00:00, 28.95it/s]
100%|██████████| 775/775 [00:15<00:00, 48.95it/s]
100%|██████████| 63/63 [00:04<00:00, 12.73it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,area,x,y,lon,lat,q_w_pearson,q_w_spearman
0,DE410000,165666.0,./DE4/DE410000/DE410000_data.csv,DE4,Brandenburg,533.24,4.511477e+06,3.320481e+06,12.835157,52.959469,0.227020,0.253190
1,DE410010,278150.0,./DE4/DE410010/DE410010_data.csv,DE4,Brandenburg,118.21,4.573204e+06,3.279801e+06,13.721427,52.569315,,
2,DE410020,166379.0,./DE4/DE410020/DE410020_data.csv,DE4,Brandenburg,1792.07,4.533215e+06,3.254909e+06,13.116304,52.362773,0.945349,0.953086
3,DE410030,166369.0,./DE4/DE410030/DE410030_data.csv,DE4,Brandenburg,3154.03,4.556514e+06,3.161963e+06,13.394420,51.519043,0.950655,0.925531
4,DE410040,165980.0,./DE4/DE410040/DE410040_data.csv,DE4,Brandenburg,284.62,4.451184e+06,3.318162e+06,11.937284,52.956437,0.513515,0.417356
...,...,...,...,...,...,...,...,...,...,...,...,...
3655,DE215350,56113404.0,./DE2/DE215350/DE215350_data.csv,DE2,Bayern,8.20,4.456659e+06,3.009715e+06,11.899471,50.183855,0.558268,0.933178
3656,DE215360,56114000.0,./DE2/DE215360/DE215360_data.csv,DE2,Bayern,14.10,4.457008e+06,3.011665e+06,11.905065,50.201292,0.996458,0.985672
3657,DE215370,56122008.0,./DE2/DE215370/DE215370_data.csv,DE2,Bayern,84.30,4.465052e+06,3.016793e+06,12.019640,50.245443,0.399310,0.898170
3658,DE215380,56143008.0,./DE2/DE215380/DE215380_data.csv,DE2,Bayern,92.40,4.462157e+06,3.021202e+06,11.980721,50.285782,0.547284,0.871654
