## methods

The stations are "snapped" to the cell with the best fit in terms of upstream area and distance, with a maximum distance of 10 km and a maximum relative error in upstream area (compared to the GRDC-reported upstream area) of 50%.

The best fit was found by minimizing the weighted average of the combined relative distance error (RD; 0 at original location to 1 at max distance) and relative upstream area error (RA; 0 at reported upstream area to 1 at maximum relative upstream area error) according to 

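$R = \frac{a \cdot RD + b \cdot RA}{a + b}$

where $a$ and $b$ are the weights of the relative distance error and the relative upstream area error, respectively.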

The procedure is similar to the one reported by Do et al. 2017 (GSIM database) and Lehner 2012 (GRDC report number 41).

In addition, for cases where no pixel is found that meets the relative upstream area error criterion (~3% of the stations), a pixel between the 99.5th and 100th percentile of upstream area was selected, resulting in relative upstream area errors larger than 1.

In [None]:
import numpy as np
from cmftools import snap_stations, cmf_index
import gistools
from os.path import join, basename
import json
import geopandas as gp
import pandas as pd
%matplotlib inline

In [None]:
# GRDC station metadata table used as input for the snapping
fn_grdc_meta = r'/home/dirk/datasets/GRDC/20170124_GRDC_Stations_meta.csv'

# snap settings
max_dist = 10e3    # maximum snapping distance (10 km in the methods text; presumably metres)
upa_relerr = 0.5   # maximum relative upstream area error (50 %)
# NOTE: the spelling "upa_weigth" is kept on purpose; the name is forwarded
# as a keyword argument, so it presumably has to match the callee's parameter.
upa_weigth = 1     # weight of the relative upstream area error (RA)
dist_weight = 2    # weight of the relative distance error (RD)

# keyword arguments forwarded unchanged to the snap helpers
kwargs = {
    'max_dist': max_dist,
    'upa_relerr': upa_relerr,
    'upa_weigth': upa_weigth,
    'dist_weight': dist_weight,
    # presumably the upstream area percentile used by the fallback described
    # in the methods text (99.5-100 percentile) -- TODO confirm in cmftools
    'local_upa_perc': 99.5,
}

In [None]:
# maps old v361
# map_dir = r'/home/dirk/Models/CaMa-Flood_v3.6.2_20140909/map/global_15min/'
# uparea_fn = r'/home/dirk/Datasets/sheds_0.005_140807/upa.vrt'
# fn_lonlat = join(map_dir, r'lonlat.tif')
# fn_catmxy = join(map_dir, r'hires', r'catmxy.vrt')
# out_fn = join(map_dir, basename(fn_grdc_meta).replace('.csv', '_snap.csv'))


In [None]:
# maps new v392 (version as used in the directory name and the output file tag)
map_dir = r'/home/dirk/models/CaMa_v392_20180727/map/glb_0.25d/'
fn_lonlat = join(map_dir, r'lonlat.tif')
# the upstream area and catmxy rasters sit in the '15sec' sub-directory
uparea_fn = join(map_dir, '15sec', 'uparea.vrt')
fn_catmxy = join(map_dir, r'15sec', r'catmxy.vrt')

# Output csv: the metadata file name with '.csv' replaced by a suffix that
# encodes the map version and the snap settings (weights, max distance, max
# relative upstream area error), written into the map directory.
snap_suffix = '_{}_snap_{:d}dist{:.0e}_{:d}upa{:.1e}.csv'
out_fn = join(map_dir, basename(fn_grdc_meta).replace('.csv', snap_suffix)).format(
    'v392', dist_weight, max_dist, upa_weigth, upa_relerr
)
print(out_fn)

### 1) load GRDC data

In [None]:
# read the station table; its first column becomes the index
grdc_meta = pd.read_csv(fn_grdc_meta, index_col=0, encoding="ISO-8859-1")
# keep the reported coordinates under explicit "*_org" names
grdc_meta = grdc_meta.rename(columns={'lat': 'lat_org', 'long': 'lon_org'})
nstations = len(grdc_meta)

# keep only stations with a positive upstream area, lon < 180 and lat < 90
valid = (
    (grdc_meta['area'] > 0)
    & (grdc_meta['lon_org'] < 180)
    & (grdc_meta['lat_org'] < 90)
)
grdc_meta = grdc_meta[valid]
ninvalid = nstations - len(grdc_meta)
print('total stations {}; stations with invalid meta data {:d}'.format(nstations, ninvalid))

# point geometries built from the reported lon/lat columns
grdc_meta_gdf = gistools.pandas2geopandas(
    grdc_meta, x_col='lon_org', y_col='lat_org', crs={'init':'epsg:4326'}
)

In [None]:
# import matplotlib.colors as mc
# from gistools.plot_tools import basemap, plot_choropleth, cmap3, cmap1

# fig, axes = basemap(coastlines=True, glob=True)
# ax =axes[0]
# t = ax.set_title('GRDC stations (n={:d})'.format(len(grdc_meta)))
# plot_kwargs = dict(markersize=8, )
# plot_choropleth(fig, ax, grdc_meta_gdf.sort_values('area', ascending=True), 'area', 
#                    clabel='upstream area [km2]', norm=mc.LogNorm,
#                    plot_kwargs=plot_kwargs)

### 2a) snap - example

Examples of the procedure for a good fit at *OBIDOS - PORTO, AMAZONAS (3629000)* and a bad fit at *GREAT KEI RIVER (1160684)* are shown below.

In [None]:
from cmftools import snap_gauge, plot_snap, read_gtiff_buf

uparea_col = 'area'
max_dist = kwargs['max_dist']

# station used for the illustration
gauge_id = 6435060
gauge = grdc_meta_gdf.loc[gauge_id]
gauge_xy = gauge.geometry.coords[0]  # first coordinate tuple of the station geometry
gauge_uparea = gauge[uparea_col]     # reported upstream area of the station

# uparea_2d, transform = read_gtiff_buf(uparea_fn, gauge_xy, buf=max_dist, layer=0)
# snap the single gauge, then plot the 'combi' result on the returned raster
snap, uparea_2d, transform, success = snap_gauge(
    uparea_fn, gauge_xy, gauge_uparea, **kwargs
)
fig, _ = plot_snap(
    {'snapped': snap['combi']},
    gauge_id, gauge_xy, gauge_uparea, uparea_2d, transform,
    **kwargs
)

# alternative example (disabled)
# gauge_id = 1160684
# gauge = grdc_meta_gdf.loc[gauge_id, :]
# gauge_xy = gauge.geometry.coords[:][0]
# gauge_uparea = gauge[uparea_col]

# snap, uparea_2d, transform, success = snap_gauge(uparea_fn, gauge_xy, gauge_uparea, **kwargs)
# fig, _ = plot_snap(dict(snapped=snap['combi']), gauge_id, gauge_xy, gauge_uparea, uparea_2d, transform, **kwargs)

### 2b) snap - all

In [None]:
# snap all stations against the upstream area raster (fig_dir=None is passed,
# presumably disabling per-station figures -- confirm in cmftools)
snap_df = snap_stations(
    grdc_meta_gdf, uparea_fn, uparea_col='area', fig_dir=None, **kwargs
)
# look up catmxy for the snapped lon/lat columns
snap_df = cmf_index(
    snap_df, fn_catmxy, fn_lonlat, x_col='lon', y_col='lat'
)
# put the original metadata and the snap results side by side and save to csv
snap_df = pd.concat([grdc_meta, snap_df], axis='columns')
snap_df.to_csv(out_fn)

In [None]:
# name the index so the csv gets a 'grdc_id' header for its first column
snap_df.index.name = 'grdc_id'

# metadata columns that are removed before the table is written
dropped_columns = [
    'wmo_reg', 'sub_reg', 'mix_reg', 'nat_id',
    't_start', 't_end', 't_yrs',
    'r_volume_yr', 'r_height_yr',
    'proc_tyrs', 'proc_tmon',
    'f_import', 'f_im_yr', 'l_import', 'l_im_yr',
    'provider_id', 'ACSYS', 'statmouth', 'GEMS', 'GCOS_GTNH', 'ltchars',
    'Pristine Basins', 'GRDC Reference Dataset',
    'AdaptAlp', 'ArctHycos', 'EWA', 'BALTEX',
]
snap_df = snap_df.drop(columns=dropped_columns)

# write the trimmed table to out_fn
snap_df.to_csv(out_fn)

### 3) analyze results

I’ve tried different values for the distance and upstream area weights and selected R = (2 * RD + RA) / 3 to penalize pixels that are further away from the reported location. CDFs of the resulting relative errors are shown below. About 3% of the stations do not meet the upstream area criterion.

In [None]:
# Snapped-station tables per weighting. Only the selected 2RD + 1RA run is
# loaded; the other two are disabled. The analysis cells below also reference
# df_1RD_2RA and df_1RD_1RA, so re-enable these lines to include them.
# fn = join(map_dir, r'20170124_GRDC_Stations_meta_snap_1dist1e+04_2upa5.0e-01.csv')
# df_1RD_2RA = pd.read_csv(fn, index_col=0)
# fn = join(map_dir, r'20170124_GRDC_Stations_meta_snap_1dist1e+04_1upa5.0e-01.csv')
# df_1RD_1RA = pd.read_csv(fn, index_col=0)

# distance weight 2, upstream area weight 1; first csv column is the index
fn = join(
    map_dir,
    r'20170124_GRDC_Stations_meta_v392_snap_2dist1e+04_1upa5.0e-01.csv',
)
df_2RD_1RA = pd.read_csv(fn, index_col=0)

In [None]:
# the five stations with the largest combined relative error (worst first)
df_2RD_1RA.sort_values(by='combi_relerr', ascending=False).head(5)

In [None]:
# Global maps of the weighted combined relative error, one per weighting.
#
# basemap / plot_choropleth are not imported anywhere in live code; the import
# path is taken from the commented-out plotting cell earlier in this notebook.
from gistools.plot_tools import basemap, plot_choropleth

# (title label, name of the DataFrame holding the snapped stations)
map_cases = [
    ('2RD + 1RA', 'df_2RD_1RA'),
    ('1RD + 2RA', 'df_1RD_2RA'),
    ('1RD + 1RA', 'df_1RD_1RA'),
]

for case_label, df_name in map_cases:
    # Only the weightings that were actually loaded can be drawn; looking the
    # frame up by name avoids a NameError for the ones that are disabled.
    if df_name not in globals():
        print('skipping map {}: {} is not loaded'.format(case_label, df_name))
        continue

    snap_gdf = gistools.pandas2geopandas(
        globals()[df_name], x_col='lon', y_col='lat', crs={'init':'epsg:4326'}
    )
    fig, axes = basemap(coastlines=True, glob=True)
    ax = axes[0]
    # n is the number of stations that are plotted in this map
    t = ax.set_title(
        'GRDC stations (n={:d}) - weighted combined rel. error - {}'.format(
            len(snap_gdf), case_label
        )
    )
    # fresh dict per call, as in the original stanzas
    plot_kwargs = dict(markersize=4, vmin=0, vmax=2)
    # sorted ascending, so stations with the largest error are passed last
    plot_choropleth(fig, ax, snap_gdf.sort_values('combi_relerr'), 'combi_relerr',
                    clabel='rel error [-]', plot_kwargs=plot_kwargs, cmap=gistools.cmap4)

In [None]:
import statsmodels.api as sm 

def ecdf(values, bins=None):
    """Evaluate the empirical CDF of a sample at a set of points.

    Parameters
    ----------
    values : array-like
        Sample the empirical CDF is built from.
    bins : array-like, optional
        Points at which the CDF is evaluated. When omitted, an evenly spaced
        grid from ``values.min()`` to ``values.max()`` is used (``np.linspace``
        with its default number of points).

    Returns
    -------
    x : numpy.ndarray
        Evaluation points.
    y : numpy.ndarray
        Empirical CDF of ``values`` at ``x``.
    """
    sample = np.asarray(values)
    if bins is not None:
        x = np.asarray(bins)
    else:
        x = np.linspace(sample.min(), sample.max())
    cdf = sm.distributions.ECDF(sample)
    return x, cdf(x)

In [None]:
# CDFs of the upstream area, distance and combined relative errors, with one
# line per weighting in each of the three panels.
import matplotlib.pyplot as plt

bins = np.arange(0, 1.2, 0.001)
pkwargs = dict(linewidth=1.5)

# (legend label, line colour, name of the DataFrame with the snapped stations)
cdf_cases = [
    ('1RD_2RA', 'b', 'df_1RD_2RA'),
    ('2RD_1RA', 'g', 'df_2RD_1RA'),
    ('1RD_1RA', 'm', 'df_1RD_1RA'),
]

# Only weightings that were actually loaded can be drawn; looking the frames
# up by name avoids a NameError for the ones that are disabled.
loaded_cases = []
for case_label, case_color, df_name in cdf_cases:
    if df_name in globals():
        loaded_cases.append((case_label, case_color, globals()[df_name]))
    else:
        print('skipping CDF {}: {} is not loaded'.format(case_label, df_name))

# Percentage of stations with an upstream area relative error above 1, i.e.
# (1 - ECDF(1)) * 100, computed directly from the data so it does not depend
# on a bin edge being exactly equal to 1.0.
for case_label, _, case_df in loaded_cases:
    frac_within = np.mean(case_df['upa_relerr'].values <= 1)
    print('{}: {:.2f}% of stations with upa rel. error > 1'.format(
        case_label, (1 - frac_within) * 100))

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(14, 5))

# (axis, error column, text used for both the x label and the title)
panels = [
    (ax1, 'upa_relerr', 'upa rel. error'),
    (ax2, 'dist_relerr', 'dist rel. error'),
    (ax3, 'combi_relerr', 'comb. rel. error'),
]
for panel_ax, column, panel_text in panels:
    for case_label, case_color, case_df in loaded_cases:
        x, y = ecdf(case_df[column].values, bins=bins)
        panel_ax.step(x, y, case_color, label=case_label, **pkwargs)
    # dashed marker at a relative error of 1
    panel_ax.axvline(x=1, color='k', linestyle='--', linewidth=1)
    panel_ax.set_xlim([0, 1.2])
    panel_ax.set_ylim([0, 1])
    panel_ax.legend()
    panel_ax.set_xlabel(panel_text)
    panel_ax.set_title(panel_text)

# only the left-most panel carries the y label
ax1.set_ylabel('CDF')