# Data Reports

Use pandas-profiling to generate data reports.  
Also adds the W ~ Q correlations to metadata.csv.

In [1]:
import os
from tqdm import tqdm
import json
import warnings
from camelsp import Bundesland, util
import pandas as pd

First, list all NUTS folders found in the output directory.

In [2]:
# every entry below the base path whose name starts with 'DE' counts as a NUTS folder
nuts = sorted(
    filter(
        lambda folder: folder.startswith('DE'),
        os.listdir(Bundesland('Hessen').base_path),
    )
)
nuts

['DE1',
 'DE2',
 'DE4',
 'DE7',
 'DE8',
 'DE9',
 'DEA',
 'DEB',
 'DEC',
 'DED',
 'DEE',
 'DEF',
 'DEG']

And now let's go: generate the HTML and JSON reports for every ID in each NUTS folder.

In [3]:
# set to true, if new output data was added (ie. rainfall)
REPLACE = False

# existing report files are only overwritten when REPLACE is set
if_exists = 'replace' if REPLACE else 'omit'

# create the reports for each NUTS folder
for ID in nuts:
    with Bundesland(ID) as bl:
        with warnings.catch_warnings(record=True) as warn:
            # Under the default filter action a warning with the same text, category
            # and line is recorded only once, which would under-report the counts
            # printed below. Record every occurrence of the two categories this cell
            # reports on; all other filters (e.g. ignored deprecations) stay untouched.
            warnings.simplefilter('always', category=UserWarning)
            warnings.simplefilter('always', category=FutureWarning)

            nuts_ids = bl.nuts_table.nuts_id.values.tolist()

            for nuts_id in tqdm(nuts_ids, desc=ID):
                # write the report files; a failure in the html report also skips
                # the json report of the same ID
                try:
                    for fmt in ('html', 'json'):
                        bl.generate_reports(nuts_ids=nuts_id, fmt=fmt, if_exists=if_exists)
                except Exception as e:
                    # best effort: log the problem and go on with the next ID
                    print(str(e))
                    warnings.warn(str(e))

            if len(warn) > 0:
                n_future = sum(1 for w in warn if issubclass(w.category, FutureWarning))
                print(f"FutureWarnings: {n_future}")
                print(f"Other warnings: {len(warn) - n_future} (possibly missing data files).")

DE1: 100%|██████████| 252/252 [12:16<00:00,  2.92s/it]
DE2: 100%|██████████| 535/535 [30:26<00:00,  3.41s/it]
DE4: 100%|██████████| 233/233 [13:10<00:00,  3.39s/it]
DE7: 100%|██████████| 97/97 [04:29<00:00,  2.77s/it]
DE8: 100%|██████████| 230/230 [12:20<00:00,  3.22s/it]
DE9: 100%|██████████| 294/294 [13:17<00:00,  2.71s/it]
DEA: 100%|██████████| 219/219 [10:42<00:00,  2.93s/it]
DEB: 100%|██████████| 124/124 [05:34<00:00,  2.69s/it]
DEC: 100%|██████████| 46/46 [02:03<00:00,  2.68s/it]
DED: 100%|██████████| 282/282 [11:33<00:00,  2.46s/it]
DEE: 100%|██████████| 126/126 [06:16<00:00,  2.99s/it]
DEF: 100%|██████████| 509/509 [16:14<00:00,  1.92s/it]
DEG: 100%|██████████| 63/63 [02:39<00:00,  2.54s/it]


## Add W ~ Q correlations to metadata

The data reports contain correlations between all data variables. We extract both Pearson's correlation coefficient and the Spearman rank correlation between Q and W and add them to the preliminary metadata file.

In [4]:
# display the metadata as it is before the correlation columns are (re-)computed below
util.get_metadata()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243,182.700000,4.352221e+06,3.124617e+06,10.446993,51.231727,29646.0,29646.0,0.969240,0.976895
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288,275.000000,4.318941e+06,3.140875e+06,9.970428,51.378709,22707.0,22707.0,0.709148,0.685209
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577,174.700000,4.386764e+06,3.077926e+06,10.933022,50.809106,35490.0,35490.0,0.958767,0.962387
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995,1383.000000,4.473276e+06,3.073272e+06,12.157989,50.750857,12845.0,12845.0,0.502141,0.553354
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517,1013.000000,4.442190e+06,3.033884e+06,11.704738,50.404273,21246.0,21246.0,0.940139,0.977699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,DE912850,4887101,./DE9/DE912850/DE912850_data.csv,DE9,Niedersachsen,Koldingen,,,4958.909671,4.307642e+06,3.240659e+06,9.804298,52.275377,0.0,12480.0,,
2866,DE912860,4948130,./DE9/DE912860/DE912860_data.csv,DE9,Niedersachsen,Tietjens Hütte,,,462.359364,4.242145e+06,3.344549e+06,8.819951,53.203393,0.0,24138.0,,
2867,DE912880,4994109,./DE9/DE912880/DE912880_data.csv,DE9,Niedersachsen,Weddewarden,,,128.718582,4.223978e+06,3.388719e+06,8.534632,53.597259,0.0,5479.0,,
2868,DE912920,9286164,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06,6.738424,52.611652,0.0,19450.0,,


In [5]:
def _q_w_correlation(report, method):
    """
    Extract the correlation between q and w from a loaded JSON data report.

    Parameters
    ----------
    report : dict
        The parsed JSON report. ``report['correlations'][method]`` is read as
        a sequence of row records keyed by variable name (assumed to carry no
        row labels, which is why rows are found via their diagonal value).
    method : str
        Name of the correlation table, i.e. ``'pearson'`` or ``'spearman'``.

    Returns
    -------
    float or None
        The ``'w'`` entry of the first row with ``q == 1.0``. None if the
        table or one of the variables is missing, or if no row with
        ``q == 1.0`` or ``w == 1.0`` exists.
    """
    try:
        rows = report['correlations'][method]

        # the q row is identified by its self-correlation of exactly 1.0
        q_row = [row for row in rows if row['q'] == 1.0][0]

        # a usable report also needs a w row with a valid self-correlation
        if not [row for row in rows if row['w'] == 1.0]:
            return None

        return q_row['w']
    except (KeyError, IndexError, TypeError):
        # incomplete or unexpectedly structured report -> no correlation available
        return None


for NUTS in util._NUTS_LVL2_NAMES.keys():
    # process this federal state
    with Bundesland(NUTS) as bl:
        pearson = []
        spearman = []

        # get the metadata
        meta = bl.metadata

        # load the Data-report for each
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            p = os.path.join(bl.base_path, 'reports', f'{camels_id}.json')

            # without a report there is nothing to extract, but the lists have
            # to stay aligned with meta.camels_id
            if not os.path.exists(p):
                pearson.append(None)
                spearman.append(None)
                continue

            # JSON is UTF-8 by specification; do not rely on the platform default
            with open(p, 'r', encoding='utf-8') as f:
                report = json.load(f)

            pearson.append(_q_w_correlation(report, 'pearson'))
            spearman.append(_q_w_correlation(report, 'spearman'))

        # all collected, build one row per camels_id
        corrs = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_w_pearson': pearson, 'q_w_spearman': spearman})

        # update
        bl.update_metadata(corrs)

metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DE9']

DE1: 100%|██████████| 252/252 [00:02<00:00, 120.07it/s]
DE2: 100%|██████████| 535/535 [00:05<00:00, 90.49it/s] 
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:02<00:00, 99.24it/s] 
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:01<00:00, 78.34it/s]
DE8: 100%|██████████| 229/229 [00:01<00:00, 127.69it/s]
DE9: 100%|██████████| 261/261 [00:02<00:00, 110.55it/s]
DEA: 100%|██████████| 219/219 [00:04<00:00, 50.69it/s]
DEB: 100%|██████████| 124/124 [00:01<00:00, 87.07it/s]
DEC: 100%|██████████| 46/46 [00:00<00:00, 88.88it/s]
DED: 100%|██████████| 178/178 [00:01<00:00, 108.17it/s]
DEE: 100%|██████████| 126/126 [00:01<00:00, 67.95it/s]
DEF: 100%|██████████| 507/507 [00:03<00:00, 168.87it/s]
DEG: 100%|██████████| 63/63 [00:01<00:00, 59.88it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_w_pearson,q_w_spearman
2609,DE910000,3183101,./DE9/DE910000/DE910000_data.csv,DE9,Niedersachsen,Sudendorf,,,121.560123,4.183280e+06,3.217135e+06,7.992302,52.047079,12845.0,12845.0,0.921013,0.928097
2610,DE910010,3346103,./DE9/DE910010/DE910010_data.csv,DE9,Niedersachsen,Schwege,,,47.371025,4.178458e+06,3.218442e+06,7.921498,52.057597,12845.0,12845.0,0.828968,0.816012
2611,DE910020,3437108,./DE9/DE910020/DE910020_data.csv,DE9,Niedersachsen,Beesten,,,407.147698,4.150721e+06,3.261657e+06,7.495577,52.437915,5966.0,5966.0,0.850115,0.637185
2612,DE910040,3449100,./DE9/DE910040/DE910040_data.csv,DE9,Niedersachsen,Spelle,,,149.735031,4.155177e+06,3.254250e+06,7.564736,52.372753,4597.0,4597.0,0.824894,0.715392
2613,DE910050,3449103,./DE9/DE910050/DE910050_data.csv,DE9,Niedersachsen,Hesselte,,,369.698771,4.142822e+06,3.259095e+06,7.380863,52.412404,22341.0,22341.0,0.925844,0.870071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,DE912850,4887101,./DE9/DE912850/DE912850_data.csv,DE9,Niedersachsen,Koldingen,,,4958.909671,4.307642e+06,3.240659e+06,9.804298,52.275377,0.0,12480.0,,
2866,DE912860,4948130,./DE9/DE912860/DE912860_data.csv,DE9,Niedersachsen,Tietjens Hütte,,,462.359364,4.242145e+06,3.344549e+06,8.819951,53.203393,0.0,24138.0,,
2867,DE912880,4994109,./DE9/DE912880/DE912880_data.csv,DE9,Niedersachsen,Weddewarden,,,128.718582,4.223978e+06,3.388719e+06,8.534632,53.597259,0.0,5479.0,,
2868,DE912920,9286164,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06,6.738424,52.611652,0.0,19450.0,,
