# Data Reports

Use pandas-profiling to generate data reports.  
Also adds W~Q correlations to metadata.csv.

In [1]:
import os
from tqdm import tqdm
import json
import warnings
import pandas as pd

from camelsp import Bundesland, util

First load all NUTS folders found in the output dir

In [2]:
# keep only the entries of the output directory whose name starts with 'DE'
nuts = [folder for folder in os.listdir(Bundesland('Hessen').base_path) if folder.startswith('DE')]
nuts.sort()
nuts

['DE1',
 'DE2',
 'DE4',
 'DE7',
 'DE8',
 'DE9',
 'DEA',
 'DEB',
 'DEC',
 'DED',
 'DEE',
 'DEF',
 'DEG']

Now generate the HTML and JSON reports for every entry in the NUTS table of each federal state.

In [3]:
# set to true, if new output data was added (ie. rainfall)
REPLACE = False

# existing report files are only overwritten when REPLACE is set
if_exists = 'replace' if REPLACE else 'omit'

# create for each report
for ID in nuts:
    with Bundesland(ID) as bl:
        with warnings.catch_warnings(record=True) as warn:
            # Without an explicit filter, a warning raised repeatedly from the same
            # location is recorded only once, so the counts printed below would
            # undercount. Record every occurrence of the categories we report on.
            for category in (FutureWarning, UserWarning):
                warnings.simplefilter('always', category)

            nuts_ids = bl.nuts_table.nuts_id.values.tolist()

            for nuts_id in tqdm(nuts_ids, desc=ID):
                # write the report files
                try:
                    bl.generate_reports(nuts_ids=nuts_id, fmt='html', if_exists=if_exists)
                    bl.generate_reports(nuts_ids=nuts_id, fmt='json', if_exists=if_exists)
                except Exception as e:
                    # best effort: report the failure and continue with the next id
                    print(str(e))
                    warnings.warn(str(e))

            if len(warn) > 0:
                # issubclass also counts subclasses of FutureWarning as FutureWarnings
                n_future = sum(1 for w in warn if issubclass(w.category, FutureWarning))
                print(f"FutureWarnings: {n_future}")
                print(f"Other warnings: {len(warn) - n_future} (possibly missing data files).")

DE1: 100%|██████████| 252/252 [00:00<00:00, 6337.71it/s]
DE2: 100%|██████████| 535/535 [00:00<00:00, 5433.87it/s]
DE4: 100%|██████████| 233/233 [00:00<00:00, 6366.10it/s]
DE7: 100%|██████████| 97/97 [00:00<00:00, 6924.83it/s]
DE8: 100%|██████████| 230/230 [00:00<00:00, 5356.89it/s]
DE9: 100%|██████████| 294/294 [00:00<00:00, 6071.46it/s]
DEA: 100%|██████████| 219/219 [00:00<00:00, 6663.81it/s]
DEB: 100%|██████████| 124/124 [00:00<00:00, 7263.07it/s]
DEC: 100%|██████████| 46/46 [00:00<00:00, 5582.70it/s]
DED: 100%|██████████| 282/282 [00:00<00:00, 5530.38it/s]
DEE: 100%|██████████| 126/126 [00:00<00:00, 5372.94it/s]
DEF: 100%|██████████| 509/509 [00:00<00:00, 6419.85it/s]
DEG: 100%|██████████| 63/63 [00:00<00:00, 6707.48it/s]


## Add W ~ Q correlations to metadata

The data reports contain correlations between all data variables. We can extract Pearson's correlation coefficient and the Spearman rank correlation and add them to the preliminary metadata file.

In [4]:
# show the current metadata table (before the W ~ Q correlation columns are added below)
util.get_metadata()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years
0,DEE10000,440003,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900,52.300,4.402365e+06,3.227863e+06,11.188884,52.154576,10684.0,10684.0,29.268493,9.389041
1,DEE10010,440004,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240,142.000,4.398520e+06,3.234256e+06,11.134176,52.212572,28879.0,28879.0,51.361644,79.117808
2,DEE10020,440008,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340,201.000,4.393928e+06,3.241476e+06,11.068552,52.278070,18355.0,18355.0,48.282192,50.284932
3,DEE10030,440010,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320,238.000,4.393036e+06,3.245503e+06,11.056336,52.314377,28951.0,28951.0,51.284932,79.315068
4,DEE10040,441201,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970,23.000,4.396340e+06,3.243827e+06,11.104414,52.298873,15067.0,15067.0,29.268493,41.276712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005,DE112470,76273,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06,9.783683,48.416174,30987.0,30987.0,65.380822,79.219178
3006,DE112480,76274,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06,9.264333,49.170156,4140.0,4140.0,11.339726,11.339726
3007,DE112490,76276,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06,10.154468,48.630150,5555.0,5555.0,15.175342,15.216438
3008,DE112500,76290,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06,9.787300,48.036705,8342.0,8342.0,22.852055,22.852055


In [5]:
def _q_w_correlation(report, method):
    """Extract the correlation between 'q' and 'w' from a loaded JSON report.

    Parameters
    ----------
    report : dict
        The parsed content of a report JSON file.
    method : str
        Key of the correlation table below ``report['correlations']``,
        e.g. ``'pearson'`` or ``'spearman'``.

    Returns
    -------
    float or None
        The ``'w'`` entry of the row whose ``'q'`` entry equals 1.0, or
        ``None`` if the table or one of the two rows cannot be found.
    """
    try:
        rows = report['correlations'][method]
        # the row of 'q' is identified by its self-correlation of exactly 1.0
        q_row = [o for o in rows if o['q'] == 1.0][0]
        # a row with a 'w' self-correlation of 1.0 has to exist as well,
        # otherwise the value is treated as missing
        [o for o in rows if o['w'] == 1.0][0]
        return q_row['w']
    except (KeyError, IndexError, TypeError):
        # table, row or column is missing from this report
        return None


for NUTS in util._NUTS_LVL2_NAMES.keys():
    # process this federal state
    with Bundesland(NUTS) as bl:
        pearson = []
        spearman = []

        # get the metadata
        meta = bl.metadata

        # load the Data-report for each
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            p = os.path.join(bl.base_path, 'reports', f'{camels_id}.json')

            # check if the report exists
            if not os.path.exists(p):
                pearson.append(None)
                spearman.append(None)
                continue

            with open(p, 'r') as f:
                report = json.load(f)

            # one value per coefficient, None if it cannot be extracted
            pearson.append(_q_w_correlation(report, 'pearson'))
            spearman.append(_q_w_correlation(report, 'spearman'))

        # all collected, build the table of new columns keyed by camels_id
        corrs = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_w_pearson': pearson, 'q_w_spearman': spearman})

        # update
        bl.update_metadata(corrs)

metadata = util.get_metadata()
metadata

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:09<00:00, 25.91it/s]
DE2: 100%|██████████| 535/535 [00:29<00:00, 18.04it/s]
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:09<00:00, 24.38it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:04<00:00, 20.01it/s]
DE8: 100%|██████████| 230/230 [00:07<00:00, 32.42it/s]
DE9: 100%|██████████| 294/294 [00:12<00:00, 23.78it/s]
DEA: 100%|██████████| 219/219 [00:15<00:00, 14.05it/s]
DEB: 100%|██████████| 124/124 [00:06<00:00, 20.17it/s]
DEC: 100%|██████████| 46/46 [00:02<00:00, 18.95it/s]
DED: 100%|██████████| 282/282 [00:10<00:00, 27.62it/s]
DEE: 100%|██████████| 126/126 [00:07<00:00, 16.79it/s]
DEF: 100%|██████████| 509/509 [00:12<00:00, 40.33it/s]
DEG: 100%|██████████| 63/63 [00:04<00:00, 14.64it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years,q_w_pearson,q_w_spearman
0,DEE10000,440003,./DEE/DEE10000/DEE10000_data.csv,DEE,Sachsen-Anhalt,Ummendorf,Aller,124.900,52.300,4.402365e+06,3.227863e+06,11.188884,52.154576,10684.0,10684.0,29.268493,9.389041,0.537862,0.581002
1,DEE10010,440004,./DEE/DEE10010/DEE10010_data.csv,DEE,Sachsen-Anhalt,Alleringersleben,Aller,113.240,142.000,4.398520e+06,3.234256e+06,11.134176,52.212572,28879.0,28879.0,51.361644,79.117808,0.861575,0.851983
2,DEE10020,440008,./DEE/DEE10020/DEE10020_data.csv,DEE,Sachsen-Anhalt,Walbeck,Aller,94.340,201.000,4.393928e+06,3.241476e+06,11.068552,52.278070,18355.0,18355.0,48.282192,50.284932,0.941525,0.895263
3,DEE10030,440010,./DEE/DEE10030/DEE10030_data.csv,DEE,Sachsen-Anhalt,Weferlingen,Aller,84.320,238.000,4.393036e+06,3.245503e+06,11.056336,52.314377,28951.0,28951.0,51.284932,79.315068,0.825320,0.812189
4,DEE10040,441201,./DEE/DEE10040/DEE10040_data.csv,DEE,Sachsen-Anhalt,Hödingen,Schölecke,93.970,23.000,4.396340e+06,3.243827e+06,11.104414,52.298873,15067.0,15067.0,29.268493,41.276712,0.727779,0.819505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3005,DE112470,76273,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06,9.783683,48.416174,30987.0,30987.0,65.380822,79.219178,0.535678,0.634351
3006,DE112480,76274,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06,9.264333,49.170156,4140.0,4140.0,11.339726,11.339726,0.931055,0.994364
3007,DE112490,76276,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06,10.154468,48.630150,5555.0,5555.0,15.175342,15.216438,0.997335,0.999000
3008,DE112500,76290,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06,9.787300,48.036705,8342.0,8342.0,22.852055,22.852055,0.976192,0.987846
