# Data Reports

Use pandas-profiling to generate data reports.  
Also adds the W~Q correlations to metadata.csv.

In [1]:
import os
from tqdm import tqdm
import json
import warnings
from camelsp import Bundesland, util
import pandas as pd

First, list all NUTS folders found in the output directory.

In [2]:
# collect the names of all entries in the output directory that start with 'DE'
# (the NUTS folders), using the base path reported by the 'Hessen' instance
output_dir = Bundesland('Hessen').base_path
nuts = sorted(name for name in os.listdir(output_dir) if name.startswith('DE'))
nuts

['DE1',
 'DE2',
 'DE4',
 'DE7',
 'DE8',
 'DE9',
 'DEA',
 'DEB',
 'DEC',
 'DED',
 'DEE',
 'DEF',
 'DEG']

Now generate the HTML and JSON reports for every station in each NUTS folder.

In [3]:
# set to true, if new output data was added (ie. rainfall)
REPLACE = False

# existing report files are only overwritten when a rebuild was requested
IF_EXISTS = 'replace' if REPLACE else 'omit'

# create the reports for each NUTS folder
for ID in nuts:
    with Bundesland(ID) as bl:
        with warnings.catch_warnings(record=True) as warn:
            # Record *every* warning. Without this, the active filters drop
            # repeated warnings raised from the same code location and the
            # counts printed below would be too low.
            warnings.simplefilter('always')

            nuts_ids = bl.nuts_table.nuts_id.values.tolist()

            for nuts_id in tqdm(nuts_ids, desc=ID):
                # write the report files; a failure for one station must not
                # abort the whole run, so it is printed and recorded as a warning
                try:
                    for fmt in ('html', 'json'):
                        bl.generate_reports(nuts_ids=nuts_id, fmt=fmt, if_exists=IF_EXISTS)
                except Exception as e:
                    print(str(e))
                    warnings.warn(str(e))

            if len(warn) > 0:
                # issubclass also counts subclasses of FutureWarning correctly
                n_future = sum(1 for w in warn if issubclass(w.category, FutureWarning))
                print(f"FutureWarnings: {n_future}")
                print(f"Other warnings: {len(warn) - n_future} (possibly missing data files).")

DE1: 100%|██████████| 252/252 [12:16<00:00,  2.92s/it]
DE2: 100%|██████████| 535/535 [30:26<00:00,  3.41s/it]
DE4: 100%|██████████| 233/233 [13:10<00:00,  3.39s/it]
DE7: 100%|██████████| 97/97 [04:29<00:00,  2.77s/it]
DE8: 100%|██████████| 230/230 [12:20<00:00,  3.22s/it]
DE9: 100%|██████████| 294/294 [13:17<00:00,  2.71s/it]
DEA: 100%|██████████| 219/219 [10:42<00:00,  2.93s/it]
DEB: 100%|██████████| 124/124 [05:34<00:00,  2.69s/it]
DEC: 100%|██████████| 46/46 [02:03<00:00,  2.68s/it]
DED: 100%|██████████| 282/282 [11:33<00:00,  2.46s/it]
DEE: 100%|██████████| 126/126 [06:16<00:00,  2.99s/it]
DEF: 100%|██████████| 509/509 [16:14<00:00,  1.92s/it]
DEG: 100%|██████████| 63/63 [02:39<00:00,  2.54s/it]


## Add W ~ Q correlations to metadata

The data reports contain the correlations between all data variables. We can extract the Pearson correlation coefficient and the Spearman rank correlation coefficient between W and Q and add them to the preliminary metadata file.

In [3]:
# display the metadata table as it is before the correlation columns are added
util.get_metadata()

Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years
0,DEG10000,573000,./DEG/DEG10000/DEG10000_data.csv,DEG,Thüringen,Ammern,Unstrut,210.243,182.700,4.352221e+06,3.124617e+06,10.446993,51.231727,29646.0,29646.0,81.219178,32.186301
1,DEG10010,447000,./DEG/DEG10010/DEG10010_data.csv,DEG,Thüringen,Arenshausen,Leine,196.288,275.000,4.318941e+06,3.140875e+06,9.970428,51.378709,22707.0,22707.0,62.208219,59.876712
2,DEG10020,574200,./DEG/DEG10020/DEG10020_data.csv,DEG,Thüringen,Arnstadt,Gera,293.577,174.700,4.386764e+06,3.077926e+06,10.933022,50.809106,35490.0,35490.0,97.230137,32.186301
3,DEG10030,576500,./DEG/DEG10030/DEG10030_data.csv,DEG,Thüringen,Berga,Weiße Elster,218.995,1383.000,4.473276e+06,3.073272e+06,12.157989,50.750857,12845.0,12845.0,31.186301,35.189041
4,DEG10040,570210,./DEG/DEG10040/DEG10040_data.csv,DEG,Thüringen,Blankenstein-Rosenthal,Saale,410.517,1013.000,4.442190e+06,3.033884e+06,11.704738,50.404273,21246.0,21246.0,58.205479,52.032877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2865,DE112470,76273,./DE1/DE112470/DE112470_data.csv,DE1,Baden-Württemberg,Blaubeuren,Blautopf,511.871,0.067,4.304981e+06,2.811435e+06,9.783683,48.416174,30987.0,30987.0,65.380822,79.219178
2866,DE112480,76274,./DE1/DE112480/DE112480_data.csv,DE1,Baden-Württemberg,Erlenbach,Sulm,160.832,101.510,4.267341e+06,2.895499e+06,9.264333,49.170156,4140.0,4140.0,11.339726,11.339726
2867,DE112490,76276,./DE1/DE112490/DE112490_data.csv,DE1,Baden-Württemberg,Bolheim,Brenz,473.000,339.811,4.332390e+06,2.835208e+06,10.154468,48.630150,5555.0,5555.0,15.175342,15.216438
2868,DE112500,76290,./DE1/DE112500/DE112500_data.csv,DE1,Baden-Württemberg,Schweinhausen,Riß,541.098,101.589,4.305130e+06,2.769259e+06,9.787300,48.036705,8342.0,8342.0,22.852055,22.852055


In [4]:
def _q_w_correlation(report, method):
    """Extract the Q~W correlation coefficient from a parsed JSON report.

    Parameters
    ----------
    report : dict
        Parsed content of a report JSON file. The coefficient is looked up
        in ``report['correlations'][method]``, which is expected to be a
        list of row dictionaries keyed by variable name.
    method : str
        Name of the correlation matrix to read, e.g. ``'pearson'`` or
        ``'spearman'``.

    Returns
    -------
    float or None
        The ``'w'`` entry of the first row whose ``'q'`` entry equals 1.0.
        ``None`` is returned if the matrix is missing or malformed, or if it
        has no row with ``q == 1.0`` or no row with ``w == 1.0``.
    """
    try:
        rows = report['correlations'][method]
        q_rows = [row for row in rows if row['q'] == 1.0]
        w_rows = [row for row in rows if row['w'] == 1.0]
    except (KeyError, TypeError):
        # matrix or variable missing / unexpected structure
        return None

    # both variables need a row with a self-correlation of 1.0
    if not q_rows or not w_rows:
        return None

    return q_rows[0]['w']


for NUTS in util._NUTS_LVL2_NAMES.keys():
    # process this federal state
    with Bundesland(NUTS) as bl:
        pearson = []
        spearman = []

        # get the metadata
        meta = bl.metadata

        # load the data report for each station
        for camels_id in tqdm(meta.camels_id.values, desc=NUTS):
            report_path = os.path.join(bl.base_path, 'reports', f'{camels_id}.json')

            # without a report there is nothing to extract
            if not os.path.exists(report_path):
                pearson.append(None)
                spearman.append(None)
                continue

            with open(report_path, 'r') as f:
                report = json.load(f)

            pearson.append(_q_w_correlation(report, 'pearson'))
            spearman.append(_q_w_correlation(report, 'spearman'))

        # all collected, build one row per station
        corrs = pd.DataFrame({'camels_id': meta.camels_id.values, 'q_w_pearson': pearson, 'q_w_spearman': spearman})

        # update
        bl.update_metadata(corrs)

# show the updated metadata rows of one NUTS region as a check
metadata = util.get_metadata()
metadata[metadata['nuts_lvl2'] == 'DE9']

DE1:   0%|          | 0/252 [00:00<?, ?it/s]

DE1: 100%|██████████| 252/252 [00:02<00:00, 120.02it/s]
DE2: 100%|██████████| 535/535 [00:05<00:00, 95.30it/s] 
DE3: 0it [00:00, ?it/s]
DE4: 100%|██████████| 233/233 [00:01<00:00, 139.64it/s]
DE5: 0it [00:00, ?it/s]
DE6: 0it [00:00, ?it/s]
DE7: 100%|██████████| 97/97 [00:00<00:00, 105.38it/s]
DE8: 100%|██████████| 229/229 [00:01<00:00, 195.53it/s]
DE9: 100%|██████████| 261/261 [00:01<00:00, 134.15it/s]
DEA: 100%|██████████| 219/219 [00:04<00:00, 51.84it/s]
DEB: 100%|██████████| 124/124 [00:01<00:00, 86.40it/s]
DEC: 100%|██████████| 46/46 [00:00<00:00, 87.60it/s]
DED: 100%|██████████| 178/178 [00:01<00:00, 100.33it/s]
DEE: 100%|██████████| 126/126 [00:03<00:00, 39.95it/s]
DEF: 100%|██████████| 507/507 [00:02<00:00, 172.63it/s]
DEG: 100%|██████████| 63/63 [00:01<00:00, 61.95it/s]


Unnamed: 0,camels_id,provider_id,camels_path,nuts_lvl2,federal_state,gauge_name,waterbody_name,gauge_elevation,area,x,y,lon,lat,q_count,w_count,q_extent_years,w_extent_years,q_w_pearson,q_w_spearman
1263,DE910000,3183101,./DE9/DE910000/DE910000_data.csv,DE9,Niedersachsen,Sudendorf,,,121.560123,4.183280e+06,3.217135e+06,7.992302,52.047079,12845.0,12845.0,35.189041,35.189041,0.921013,0.928097
1264,DE910010,3346103,./DE9/DE910010/DE910010_data.csv,DE9,Niedersachsen,Schwege,,,47.371025,4.178458e+06,3.218442e+06,7.921498,52.057597,12845.0,12845.0,35.189041,35.189041,0.828968,0.816012
1265,DE910020,3437108,./DE9/DE910020/DE910020_data.csv,DE9,Niedersachsen,Beesten,,,407.147698,4.150721e+06,3.261657e+06,7.495577,52.437915,5966.0,5966.0,16.342466,16.342466,0.850115,0.637185
1266,DE910040,3449100,./DE9/DE910040/DE910040_data.csv,DE9,Niedersachsen,Spelle,,,149.735031,4.155177e+06,3.254250e+06,7.564736,52.372753,4597.0,4597.0,12.591781,12.591781,0.824894,0.715392
1267,DE910050,3449103,./DE9/DE910050/DE910050_data.csv,DE9,Niedersachsen,Hesselte,,,369.698771,4.142822e+06,3.259095e+06,7.380863,52.412404,22341.0,22341.0,61.205479,55.202740,0.925844,0.870071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,DE912850,4887101,./DE9/DE912850/DE912850_data.csv,DE9,Niedersachsen,Koldingen,,,4958.909671,4.307642e+06,3.240659e+06,9.804298,52.275377,0.0,12480.0,,34.189041,,
1520,DE912860,4948130,./DE9/DE912860/DE912860_data.csv,DE9,Niedersachsen,Tietjens Hütte,,,462.359364,4.242145e+06,3.344549e+06,8.819951,53.203393,0.0,24138.0,,66.128767,,
1521,DE912880,4994109,./DE9/DE912880/DE912880_data.csv,DE9,Niedersachsen,Weddewarden,,,128.718582,4.223978e+06,3.388719e+06,8.534632,53.597259,0.0,5479.0,,15.008219,,
1522,DE912920,9286164,./DE9/DE912920/DE912920_data.csv,DE9,Niedersachsen,Laar,,,1749.573210,4.100147e+06,3.283012e+06,6.738424,52.611652,0.0,19450.0,,53.284932,,
