In [None]:
import xarray as xr
from os.path import join
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from stats_eva import get_peaks

## read and align data

In [None]:
# Load the raw forcing datasets lazily (dask chunks along time); resampling to
# daily values and alignment are done in the next cell.
ddir = r'../01_forcing'
# alternative discharge source, kept for reference:
# daq = xr.open_dataset(join(ddir, 'cama_discharge_beira_daily.nc'), chunks={'time':30})['discharge']
daq = xr.open_dataset(join(ddir, 'glofas_discharge_beira.nc'), chunks={'time':30})['discharge']
# chunk size 6*24*100 suggests 10-minute data (100 days per chunk) -- TODO confirm
dsh_hr = xr.open_dataset(join(ddir, 'reanalysis_gtsm_v1_beira_extended.nc'), chunks={'time':6*24*100})#.drop_vars('waterlevel')
# dsh_hr['surge'] = np.maximum(0, dsh_hr['surge'])
# tide is derived as total water level minus surge
dsh_hr['tide'] = dsh_hr['waterlevel'] - dsh_hr['surge']
# 'sw' = surge plus 0.2 * 'shww' (presumably a wave set-up approximation -- TODO confirm)
dsh_hr['sw'] = dsh_hr['surge'] + dsh_hr['shww']*0.2
# alternative total water level that includes the wave term
dsh_hr['waterlevel_tsw'] = dsh_hr['tide'] + dsh_hr['sw']
# short names used throughout the notebook: s(urge), w(aves), t(ide)
dsh_hr = dsh_hr.rename({'surge': 's', 'shww': 'w', 'tide': 't'})
dap_hr = xr.open_dataset(join(ddir, 'era5_precip_beira_hourly_spatialmean.nc'), chunks={'time':24*30})['precip']

In [None]:
# Aggregate to daily values (daily maximum for sea-level components, daily sum
# for rainfall), merge all drivers on one daily time axis and write to disk.
# label='right' stamps each daily bin with its right edge.
dsh = dsh_hr.resample(time='1D', label='right').max('time')
dap = dap_hr.resample(time='1D', label='right').sum('time')
dates = pd.date_range('19800101', '20210201', freq='D')
ds = xr.merge([
    # discharge index 1 -> 'qb', index 4 -> 'qp' (labelled Buzi / Pungwe in `labels`)
    daq.sel(index=1).rename('qb'),
    daq.sel(index=4).rename('qp'),
    dsh[['s', 't', 'w', 'sw']],
    dsh['waterlevel'].rename('h'),
    dsh['waterlevel_tsw'].rename('h_tsw'),
    dap.rename('p')
    # compat='override' skips conflict checks on shared non-index variables;
    # reindex pads missing days in `dates` with NaN
], compat='override').reset_coords(drop=True).reindex(time=dates)
ds.attrs = {}
# NOTE(review): this writes '..daily2..' while the next cell reads '..daily1..' -- confirm which file is intended
ds.to_netcdf(join(ddir, 'beira_drivers_daily2_glofas.nc'))

In [None]:
# Reload the merged daily drivers from disk so the analysis can start here
# without re-running the preprocessing cells.
ddir = r'../01_forcing'
# NOTE(review): the preprocessing cell writes 'beira_drivers_daily2_glofas.nc', not '..daily1..' -- confirm the file name
ds = xr.open_dataset(join(ddir, 'beira_drivers_daily1_glofas.nc'))
# optional variants, kept for reference:
# ds = ds.drop_vars(['h']).rename({'h_tsw': 'h'})
# ds = ds.drop_vars(['w', 'sw', 'h_tsw'])
# ds = ds.sel(time=slice('19800101', '20181231'))

In [None]:
# Axis label per driver in the form "<name>\n[<unit>]"; the part before the
# newline is reused as a short tick label in later plots.
labels = dict([
    ('qb', 'Discharge Buzi\n[m3/s]'),
    ('qp', 'Discharge Pungwe\n[m3/s]'),
    ('p', 'Rainfall\n[mm]'),
    ('s', 'Surge\n[m]'),
    ('ss', 'Skew surge\n[m]'),
    ('t', 'Tide\n[m+MSL]'),
    ('w', 'Sign. wave height\n[m]'),
    ('h', 'Total waterlevel\n[m+MSL]'),
    ('sw', 'Non-tidal residual\n[m]'),
])

## get annual maxima peaks

In [None]:
# settings
# resampling period used for block maxima: yearly blocks anchored on 1 August
period='AS-AUG'

# Extract peaks per driver with `get_peaks` (from stats_eva); the tide 't' is skipped.
ds_peaks = xr.Dataset(coords=ds.coords)
for dvar in ds.data_vars.keys():
    if dvar == 't': continue
    ds_peaks[dvar] = get_peaks(ds[dvar], period=period, min_dist=14, min_sample_size=90)
# keep only time steps where at least one driver has a peak
ds_peaks = ds_peaks.reset_coords(drop=True).dropna('time', how='all')

# peaks with dates
df_peaks0 = ds_peaks.to_dataframe()  

# get maximum values within time window for non_extremes
# (for drivers without a peak on a given date, use the 7-day rolling maximum instead)
df_peaks0_filled = pd.DataFrame()
# NOTE(review): a positional 'time' is passed to the rolling reducer -- confirm the installed xarray accepts this
ds_tmax = ds.rolling(time=7).max('time').sel(time=df_peaks0.index)
# ds_tmax = ds.sel(time=df_peaks0.index)
for dvar in df_peaks0.columns:
    df_peaks0_filled[dvar] = df_peaks0[dvar].where(df_peaks0[dvar].notna(), ds_tmax[dvar])

# peaks with regular spaced interval
# (one row per `period` block; blocks with a missing driver are dropped)
df_bm = ds_peaks.resample(time=period).max('time').to_dataframe().dropna()
# number of drivers with peaks; reused as the number of subplot rows below
n = len(ds_peaks.data_vars)


In [None]:
# One panel per driver: full daily series in black with the selected peaks as red dots.
fig, axes = plt.subplots(n, 1, figsize=(12, 3*n), sharex=True)
for i, dvar in enumerate(ds_peaks.data_vars.keys()):
    ds[dvar].to_series().plot(ax=axes[i], color='k')
    # lw=0 hides the connecting line so only the peak markers are drawn
    df_peaks0[dvar].plot(ax=axes[i], color='r', marker='.', lw=0)
    axes[i].set_ylabel(labels[dvar])

## fit eva

In [None]:
# Fit an extreme-value distribution to the peaks of every driver and plot the
# fitted return values; the fitted parameters are collected in `df_eva`.
from stats_eva import lmoment_fitopt, get_frozen_dist, plot_return_values
dparams = ['shape', 'loc', 'scale']
# candidate distributions passed to lmoment_fitopt
distributions = ['gev', 'gumb']#[1:]

fig, axes = plt.subplots(n, 1, figsize=(8, 3*n), sharex=True)

df_eva = pd.DataFrame(columns=['dist'] + dparams)
for i, dvar in enumerate(ds_peaks.data_vars.keys()):
    x = ds_peaks[dvar].dropna('time').values
    # selection among `distributions` uses criterium='AIC' (see stats_eva.lmoment_fitopt)
    params, dist = lmoment_fitopt(x, distributions=distributions, criterium='AIC')
    # right-align params on dparams: a 2-parameter fit fills only 'loc' and 'scale'
    df_eva.loc[dvar, dparams[-len(params):]] = params
    df_eva.loc[dvar, 'dist'] = dist
    _ = plot_return_values(x, params, dist, ax=axes[i])
    axes[i].set_title(dvar)
    axes[i].set_ylim([x.min()*0.9, axes[i].get_ylim()[1]])
    axes[i].set_ylabel(labels[dvar])
    # only the bottom panel keeps its x-label
    if i < n-1:
        axes[i].set_xlabel('')
df_eva

## co-occurrence

In [None]:
# Group the per-driver peaks into events: consecutive peak dates closer together
# than the driver-specific maximum time lag belong to the same event.
# NOTE that `max_time_lag` is the timelag between two consecutive extremes
# the total time lag between all four events could theoretically be n times as long
## settings
drivers = ['qb', 'qp', 'p', 's']
max_time_lag = {
    'qp': pd.Timedelta('5D'),
    'qb': pd.Timedelta('5D'),
    'p': pd.Timedelta('1D'),
    's': pd.Timedelta('1D'),
    'w': pd.Timedelta('1D'),
    'h': pd.Timedelta('1D'),
    'sw': pd.Timedelta('1D'),
}

# reduce daily events to selected drivers
df_peaks = df_peaks0[drivers].dropna(how='all')

# combine daily events to event sets based on maximum time_lag
# dt: time since the previous peak date (0 for the first row)
df_peaks['dt'] = np.hstack([[0], np.diff(df_peaks.index)])
# max_dt: largest allowed lag among the drivers that peak on this date
df_peaks['max_dt'] = (df_peaks[drivers].notna()*np.array([max_time_lag[d] for d in drivers])).max(axis=1)
# df_peaks['max_dt'] = [pd.Timedelta(days=d) for d in df_peaks['max_dt'].dt.days.rolling(2, min_periods=1).max().values]
# a new event id starts whenever the gap to the previous peak is not below max_dt
df_peaks['event'] = np.cumsum(~(df_peaks['dt']<df_peaks['max_dt']))
# per event: number of peaks for each driver
df_events = df_peaks.groupby('event').apply(lambda x: x.notna().sum()).drop(columns=['dt', 'event'])
df_events['n_extreme'] = df_events[drivers].sum(axis=1)
df_events['time'] = df_peaks.reset_index().groupby('event').first()['time']
# shift dates so that the "year" of an event follows the block-maxima period start
offset = pd.Timedelta(days=df_bm.index.dayofyear.min()-1)
df_events['year'] = (df_events['time'] - offset).dt.year
# total lag within an event: sum of gaps, excluding the gap to the preceding event
df_events['time_lag'] = [evnt.iloc[1:,:]['dt'].sum() for _, evnt in df_peaks.groupby('event')]
evnts, cnts = np.unique(df_peaks['event'], return_counts=True)
# flag peak dates that belong to an event spanning more than one date
df_peaks['co-occur'] = np.isin(df_peaks['event'], evnts[cnts>1])

# make barcode plot for co-occuring events
data = df_events[drivers].T * df_events['n_extreme']
_, ax = plt.subplots(1,1, figsize=(12,5))
ax.imshow(data, interpolation='none', cmap='Blues', vmin=0, vmax=len(drivers))
ax.set_aspect(2)
ax.set_yticks(np.arange(len(drivers)))
ax.set_yticklabels([labels[dvar].split('\n')[0] for dvar in drivers])

# tick every second year at the position of its first event
xlabs = df_events.reset_index().groupby('year').first()[['event']]
_ = ax.set_xticks(xlabs['event'].values[::2]-1.5)
_ = ax.set_xticklabels(xlabs.index.values[::2], rotation=45)

# remove incomplete years
# (a complete year has exactly one extreme per driver)
check = df_events[['n_extreme', 'year']].groupby('year').sum()
drop_years = check[check['n_extreme']!=len(drivers)].index.values
# fixed: was `> 1`, which silently kept the data when exactly one year was incomplete
if drop_years.size > 0:
    df_events = df_events[~np.isin(df_events['year'], drop_years)]
    print(f'ignore years with more/less events than drivers: {drop_years}')
print(f'no. of valid years: {np.unique(df_events.year).size}')
print(f'no. of events: {df_events.index.size}; (>1 extreme: {df_events[df_events.n_extreme>1].index.size})')
print(f'max time lag: {df_events.time_lag.max().days} days')

In [None]:
# Count how often each combination of drivers occurs in an event and show it as
# a bar chart (top) with a dot matrix of the drivers involved (bottom).
df_events_dist = df_events.groupby(drivers + ['n_extreme']).size().reset_index(name='count')
df_events_dist = df_events_dist.sort_values('count', ascending=False)


fig, (ax1, ax) = plt.subplots(2,1, figsize=(7,3), sharex=True, gridspec_kw={'height_ratios': [2, 1], 'hspace':0.0})
# reset_index so bars are positioned 0..k-1 in sorted order
df_events_dist.reset_index()['count'].plot.bar(ax=ax1, color='k')
ax1.set_ylabel('count [-]')

# x = event-type position, y = driver index, for every non-zero entry
x, y = np.where(df_events_dist[drivers])
ax.scatter(x, y, color='k')
ax.set_ylim([-0.5, len(drivers)-0.5])
ax.set_yticks(np.arange(len(drivers)))
ax.set_yticklabels([labels[dvar].split('\n')[0] for dvar in drivers])
ax.set_xlabel('event type')
ax.set_xticklabels('')

plt.tight_layout()


## multivariate dependence modelling

In [None]:
# Fit a vine copula to the block maxima of the selected drivers.
import pyvinecopulib as pv

# Transform copula data using the empirical distribution
df_uobs = pd.DataFrame(columns=drivers, data = pv.to_pseudo_obs(df_bm[drivers].values))

# fit copula
controls = pv.FitControlsVinecop(
    # candidate pair-copula families: BicopFamily values 0..10
    family_set=[pv.BicopFamily(x) for x in np.arange(0,11)],
    # parametric_method='itau',
    # nonparametric_method='quadratic'
)
# the vine structure is fixed up front via RVineStructure(len(drivers))
cop = pv.Vinecop(data=df_uobs.values, controls=controls, structure=pv.RVineStructure(len(drivers)))
# cop.structure

In [None]:
# Walk over all trees/edges of the fitted vine, print each pair copula and
# collect pair, conditioning set, family, parameters and tau in `df_cop`.
from scipy.stats import kendalltau, pearsonr

data = []
m = cop.matrix
# NOTE: rebinds the notebook-global `n` (previously the number of subplot rows)
n = m.shape[0]
v = drivers
for t in range(n-1):
    # print(f'** Tree: {t:d}')
    for e in range(n-1-t):
        # matrix entries are 1-based variable ids, hence the -1 when indexing `v`
        p1, p2 = v[int(m[n-1-e,e]-1)], v[int(m[t,e]-1)]
        # conditioning variables of this edge (empty in the first tree)
        px = [v[int(p-1)] for p in m[:t,e]]
        c = cop.get_pair_copula(t,e)
        # unconditional Kendall tau of the pseudo-observations, for comparison
        tau0, ptau = kendalltau(df_uobs[p1], df_uobs[p2])
        tau = cop.get_tau(t,e)  # NOTE: diffferent from scipy.stats method ?
        pxs = f' | ' + ','.join(px) if px else ''
        cstr = c.str().replace(f'\n',',')
        print(f'{p1},{p2}{pxs}: {cstr}; tau = {tau:.5f} ({tau0:.3f}, {ptau:.3f})')
        # the copula name is taken as the text before the first comma of c.str()
        data.append([t, e, [p1,p2], px, c.str().split(',')[0], c.parameters.flatten(), tau])

df_cop = pd.DataFrame(
    data=data,
    columns=['tree', 'edge', 'pair', 'conditional', 'copula', 'parameters', 'tau']
)
# displayed only; df_cop itself keeps its default integer index
df_cop.set_index(['tree', 'edge'])


In [None]:
# Lower-triangular matrix of pairwise scatter plots of the block maxima, with a
# kernel-density background for pairs whose fitted copula is not 'Independence'
# and the co-occurring (compound) events highlighted.
# gaussian_kde is imported here: it is used below, but its only other import
# sits in a later cell, which breaks a top-to-bottom run.
from scipy.stats import gaussian_kde, percentileofscore
seeds = np.arange(len(drivers), dtype=int)

n = len(drivers)-1
fig, axes = plt.subplots(n,n, sharex=False, sharey=False, figsize=(8,8), gridspec_kw={'hspace':0.0, 'wspace':0.0})

# df_uobs_cooc = df_uobs.where(df_peaks[drivers][df_peaks['co-occur']].resample(period).max().reset_index(drop=True).notna())
# block maxima restricted to peaks that are part of a multi-date event
df_obs_cooc = df_peaks[drivers][df_peaks['co-occur']].resample(period).max()

# simulate from the copula and map back with the empirical quantiles of df_bm
df_usim = pd.DataFrame(data=cop.simulate(10000, seeds=seeds), columns=drivers)
df_sim = pd.DataFrame(columns=drivers)
for dvar in drivers:
    df_sim[dvar] = np.quantile(df_bm[dvar], df_usim[dvar])


for r in range(n):
    for c in range(n):
        ax = axes[r,c]
        # only the lower triangle is used
        if c > r:
            ax.set_visible(False)
            continue
        xlab, ylab = drivers[c], drivers[r+1]
        x, y = df_bm[xlab].values, df_bm[ylab].values
        # axis limits: minimum up to the 95% quantile
        xmin, xmax = np.quantile(x, [0,0.95])
        ymin, ymax = np.quantile(y, [0,0.95])

        # row of df_cop describing this pair (in either order)
        c0 = df_cop[
            [
                ([xlab, ylab] == p0 or [ylab, xlab] == p0)
                for p0 in df_cop['pair'].values
            ]
        ].squeeze()

        if len(c0) > 0 and c0.copula != 'Independence':
            xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
            positions = np.vstack([xx.ravel(), yy.ravel()])
            # TODO: use copula pdf in uspace instead of
            # bicop = cop.get_pair_copula(c0.tree,c0.edge)
            # u = np.stack([
            #     np.vectorize(lambda x: percentileofscore(df_bm[xlab], x))(xx.ravel()),
            #     np.vectorize(lambda x: percentileofscore(df_bm[ylab], x))(yy.ravel()),
            # ]).T/100.
            # f = bicop.pdf(u).reshape(xx.shape)
            kernel = gaussian_kde(df_bm[[xlab,ylab]].values.T, bw_method=0.5)
            f = np.reshape(kernel(positions).T, xx.shape)
            cmap = 'Blues' if  c0.copula != 'Independence' else 'Greens'
            ax.contourf(xx, yy, f, cmap=cmap, alpha=0.5)

        # plot events
        x, y = df_bm[xlab].values, df_bm[ylab].values
        xcooc, ycooc = df_obs_cooc[xlab].values, df_obs_cooc[ylab].values
        ax.scatter(xcooc, ycooc, s=13, color='darkorange', label='compound', zorder=2)
        ax.scatter(x, y, s=12, color='black', label='other', alpha=0.9, zorder=1)
        # a single legend, placed in the empty upper-right part of the figure
        if r == n-1 and c == n-2:
            ax.legend(title='AM events', loc='lower left', bbox_to_anchor=(1.1, 1.1))
        kwargs = dict(
            horizontalalignment='center',
            verticalalignment='top',
            transform=ax.transAxes,
            bbox=dict(facecolor='white', alpha=0.5, lw=0.1)
        )
        # annotate with Kendall's tau; '**' for p < 0.05, '*' for p < 0.10
        tau, ptau = kendalltau(x, y)
        tsig = ('**' if ptau < 0.05 else '*') if ptau < 0.10 else ''
        txt = f'{c0.copula} ' + r'($\tau$='+f'{tau:.2f}{tsig})'
        ax.text(0.5, 0.95, txt, **kwargs)

        if c == 0:
            ax.set_ylabel(labels[ylab])
        else:
            ax.set_yticklabels('')
        if r == n-1:
            ax.set_xlabel(labels[xlab])
        ax.set_ylim([ymin, ymax])
        ax.set_xlim([xmin, xmax])

# plt.savefig(join(r'../FIGURES', 'copula_autofit.png'), dpi=300, bbox_axes='tight')

## Create stochastic event set

In [None]:
from scipy.interpolate import interp1d
from scipy.stats import gaussian_kde

class emperical_dist(object):
    """Empirical distribution of event magnitudes, expressed per year.

    The events are ranked in descending order and each gets the exceedance
    frequency ``rank/(nevents+1) * (nevents/nyears)``. Only events whose
    frequency is below 1 are kept.

    Parameters
    ----------
    data : array-like
        Event magnitudes; flattened to 1D.
    nyears : float
        Length in years of the record from which `data` was drawn.

    Attributes
    ----------
    data : np.ndarray
        Retained magnitudes, sorted descending.
    freq : np.ndarray
        Exceedance frequency matching `data` (ascending, all < 1).
    """

    def __init__(self, data, nyears):
        # accept lists / N-d arrays as well as the 1D ndarray used by callers
        data = np.asarray(data).ravel()
        nevents = data.size
        self.data = np.sort(data)[::-1]  # descending
        self.freq = np.arange(1,nevents+1)/(nevents+1)*(nevents/nyears)
        keep = self.freq < 1
        self.data = self.data[keep]
        self.freq = self.freq[keep]

    def ppf(self, q):
        """Return the magnitude at non-exceedance probability `q`.

        Interpolates linearly between the empirical points; values of `q`
        outside the sampled range are clamped to the smallest / largest
        retained magnitude.
        """
        return interp1d(
            1-self.freq,
            self.data,
            bounds_error = False,
            fill_value=(self.data[-1], self.data[0])
        )(q)

    def sf(self, x):
        """Return the exceedance frequency of magnitude `x` (inverse of `ppf`).

        Provided so that this class can be used where a frozen distribution
        with an ``sf`` method is expected. Values of `x` outside the sampled
        range are clamped to the largest / smallest retained frequency.
        """
        # np.interp needs ascending x-coordinates, hence the reversed arrays;
        # unlike interp1d it also tolerates repeated magnitudes
        return np.interp(x, self.data[::-1], self.freq[::-1])

    def cdf(self, x):
        """Return the non-exceedance probability of `x`, i.e. ``1 - sf(x)``."""
        return 1 - self.sf(x)

    def pdf(self, x, **kwargs):
        """Return a Gaussian kernel density estimate of the retained data at `x`.

        Extra keyword arguments are passed on to ``scipy.stats.gaussian_kde``.
        """
        return gaussian_kde(self.data, **kwargs)(x)

In [None]:
# use marginal distributions to transform quantiles back to normal space
# Build one frozen distribution per fitted driver from the parameters in df_eva.
extr_dist = {}
for dvar in df_eva.index:
    # dropna removes unused parameters (e.g. no 'shape' for a 2-parameter fit)
    params = df_eva.loc[dvar, dparams].dropna()
    dist = df_eva.loc[dvar, 'dist']
    extr_dist[dvar] = get_frozen_dist(params, dist)

In [None]:
# Read return levels indexed by return period T and build an interpolator from
# annual non-exceedance probability (1 - 1/T) to the 'local_coastrp' level.
df_coastrp = pd.read_csv(join(ddir, 'annual_T_convolution_coastrp_local.csv'), index_col=['T'])
df_coastrp['annual_exc_prob'] = 1/df_coastrp.index
df_coastrp['cdf'] = 1-df_coastrp['annual_exc_prob']
# probabilities outside the tabulated range return (max level + 0.01) on both sides
coastrp_cdf = interp1d(df_coastrp['cdf'], df_coastrp['local_coastrp'], bounds_error = False, fill_value = df_coastrp['local_coastrp'].max()+0.01) #We interpolate the empirical CDF

In [None]:
#Surge from COAST-RP
# Replace the fitted surge marginal with an empirical distribution of the
# surges in 'Beira_STORM_surges.nc', treated as a 3000-year record.
fileS = 'Beira_STORM_surges.nc' #We look at the 3000 yr of data
da_surge = xr.open_dataarray(join(ddir, fileS))
extr_dist['s'] = emperical_dist(da_surge.values.flatten(), nyears=3000)

# Return levels at one station, reshaped into a series indexed by return period.
ds_coastrp = xr.open_dataset(join(ddir, 'COAST-RP.nc'))
ds_coastrp = ds_coastrp.where(ds_coastrp['station_id']=='id_coast_glob_18328', drop=True).set_coords('station_id').squeeze()
# each data variable holds one return period; stack them along a new 'rps' dimension
da =xr.concat([ds_coastrp[dvar] for dvar in ds_coastrp.data_vars], dim='rps').rename('storm_tide')
# the return period is parsed from the trailing "_<int>" of each variable name
da['rps'] = xr.IndexVariable('rps', [int(dvar.split('_')[-1]) for dvar in ds_coastrp.data_vars])
coastrp = da.to_series()

In [None]:
# Hourly tide (first value of each hour) arranged in rolling 72-step windows.
tide_hr = dsh_hr['t'].resample(time='1H', label='left').first()
# one 72-value window per time step; the first and last 72 steps are discarded
tide_hr_wdw = tide_hr.rolling(time=72).construct('window').isel(time=slice(72,-72))
# keep only windows without any missing value
tide_hr_wdw = tide_hr_wdw.where(~np.isnan(tide_hr_wdw).any('window'), drop=True)

In [None]:
# Hourly surge (first value of each hour) and the hydrographs around its peaks.
# get_peak_hydrographs is imported here because this cell uses it before the
# only other import of it, which sits in a later cell.
from stats_eva import get_peak_hydrographs

surge_hr = dsh_hr['s'].resample(time='1H', label='left').first()
# peaks are selected per 3-day period; the hydrograph axis is renamed to 'window'
surge_hr_wdw = get_peak_hydrographs(surge_hr, get_peaks(surge_hr, period='3D'), 72).load().rename({'time': 'window'})

In [None]:
# Triangular unit surge shape over 72 steps: linear rise 0 -> 1 in 25 steps,
# then linear fall back to 0 in 47 steps.
dsurge = xr.DataArray(dims=['window'], data=np.hstack([np.linspace(0,1,25), np.linspace(1-1/46,0,47)]))
# quick check: peak level of tide window 4 combined with a surge of amplitude 2
(tide_hr_wdw.isel(time=4)+dsurge*2).max('window')

In [None]:
# Combine every simulated surge with randomly drawn tide windows and compare
# the return periods of the resulting peak water levels with the reanalysis.
# high_tide = dsh_hr['t'].load().resample(time='29.5D').max()
# df_tide=high_tide.dropna('time').to_series()
# df_tide = dsh_hr['t'].load().dropna('time').to_series()

# _get_return_periods was used here without being imported anywhere in the notebook
from stats_eva import _get_return_periods

# each simulated surge is paired with n random tide windows
n=100
surge = xr.DataArray(dims=['time'], data=np.repeat(df_sim[['s']].values,n))
# NOTE(review): unseeded draw, so the figure changes on every run
idx = np.random.randint(0, tide_hr_wdw.time.size, surge.size)
tide = tide_hr_wdw.isel(time=idx).drop('time')
# peak of tide + scaled unit surge shape within each window
h = (tide+dsurge*surge).max('window').values

fig, ax = plt.subplots(1,1)
# coast_rp
x = df_bm['h'].values
rps = _get_return_periods(x)
ax.scatter(rps, x, color='purple', label='reanalysis')

# x = df_coastrp['local_coastrp'].values
# rps = df_coastrp.index.values
# ax.scatter(rps, x, color='m', label='local_coastrp')

# ax.plot(coastrp.index, coastrp.values, '--k', label='coastrp')

rps = _get_return_periods(h)
ax.plot(rps, h, '.g', label='simulated')
ax.set_xscale('log')
ax.set_xlabel('return period [years]')
# fixed label: the plotted quantity is the total water level (tide + surge), not the surge
ax.set_ylabel('waterlevel [m+MSL]')
ax.set_ylim([3.5, 7])
ax.legend()
ax.grid()

In [None]:
# Sample from the copula
# seed reused by the later pandas `.sample(...)` calls
random_state=1234
# one seed per driver, passed to cop.simulate
seeds = np.arange(len(drivers), dtype=int)
n_sim = 10000
df_usim = pd.DataFrame(data=cop.simulate(n_sim, seeds=seeds), columns=drivers)

# transform extremes based on dvar marginal
# (uniform samples -> physical values via each driver's ppf, rounded to 2 decimals)
df_sim = pd.DataFrame()
for dvar in drivers:
    df_sim[dvar] = extr_dist[dvar].ppf(df_usim[dvar].values).round(2)


In [None]:
# combine co-occurence and simulations
# Every simulated year borrows the event pattern (which drivers peak together)
# of an observed year, cycling through the observed years in order.
from itertools import cycle
event_cycle = cycle(df_events.groupby('year'))
sim_events = []
for i, row in df_sim.iterrows():
    # per-event driver counts of the next observed year
    event_yr = next(event_cycle)[1][drivers] 
    # multiply counts by the simulated magnitudes: drivers not extreme in an event become 0
    events = (event_yr * row)
    events['year'] = int(i+1)
    sim_events.append(events)

df_sim_events = pd.concat(sim_events, axis=0, ignore_index=True)
# boolean mask of the entries that carry a simulated extreme
df_sim_events_am = df_sim_events>0
df_sim_events_am['year'] = df_sim_events['year']
df_sim_events.head()

In [None]:
# # to sample non-extreme values use time lag window around events where
# # another driver is extreme, but dvar is not
# Current approach: the pool of "normal" values per driver is every daily value
# below that driver's 80% quantile.
xval_norm = {}
for dvar in drivers:
    # dates = df_peaks.index[df_peaks[dvar].isna()]
    # if dvar not in ['s', 'w']:
    #     xval = ds[dvar].rolling(time=max_time_lag[dvar].days).construct('window').sel(time=dates)
    # else:
    #     xval = dsh_hr[dvar].load().rolling(time=6*24*max_time_lag[dvar].days).construct('window').sel(time=dates)
    # xval = xval.stack({'time1':('time', 'window')}).to_series().dropna()
    # xval_norm[dvar] = xval
    xval_norm[dvar] = ds[dvar].where(ds[dvar]<ds[dvar].quantile(0.8),drop=True).to_series().dropna()

# # sample random value around extremes with non-extreme dvar
s = df_sim_events.index.size
for dvar in drivers:
    # the same random_state is used for every driver
    x = xval_norm[dvar].sample(s, replace=True, random_state=random_state).values
    # keep simulated extremes, replace everything else by a sampled normal value
    df_sim_events[dvar] = df_sim_events[dvar].where(df_sim_events_am[dvar]>0, x)

df_sim_events.head()

In [None]:
# Mean hydrograph around the peaks, computed for the surge only.
from stats_eva import get_peak_hydrographs
hydrographs = {}
for dvar in ['s']:
    da = dsh_hr[dvar].load()
    # window argument 6*24*2; average over all peaks
    hydrographs[dvar] = get_peak_hydrographs(da, get_peaks(da, period=period), 6*24*2).mean('peak').to_series()
df_hydro = pd.DataFrame(hydrographs)
# convert the index from time steps to hours (presumably 10-minute steps -- TODO confirm)
df_hydro.index = df_hydro.index/6
df_hydro.plot()

In [None]:
# get tide timeseries
# 12-hourly maxima of the tide signal
high_tide = dsh_hr['t'].load().resample(time='12H').max()
df_tide1=high_tide.dropna('time').to_series()
# df_tide = ds['t'].dropna('time').to_series()

# high_tide = dsh_hr['t'].resample(time='14D').max()
# df_tide = high_tide.dropna('time').to_series()

# high_tide = ds['t']
# df_tide_am = high_tide.where(high_tide>high_tide.quantile(0.98)).dropna('time').to_series()
# NOTE(review): this df_tide_am is overwritten a few lines below by the csv-based version
df_tide_am = high_tide.resample(time='AS').max('time').dropna('time').to_series()
# df_tide_am = da_tide_am.where(~np.isin(da_tide_am.time.dt.year,drop_years)).dropna('time').to_series()


# NOTE(review): the path treats '<name>.nc' as a directory under '../02_data' -- confirm this layout exists
fn = join('../02_data', 'reanalysis_gtsm_v1_beira_extended.nc', 'tide_peaks.csv')
df_tide = pd.read_csv(fn, index_col = 'index', parse_dates = True)['high_tide']
# annual maxima of the tide peaks, dropping the last two years
df_tide_am = df_tide.resample('AS').max().iloc[:-2]
df_tide, df_tide1

In [None]:
# one dependent water component
# Total water level = random high tide + simulated surge; stored uncorrected
# ('h0') and with an annual-maximum-tide lower bound ('h').
hname = 's'

# surge scaling factor, currently fixed at 1
delta_s = 1.0#hydrographs['s'].sample(n_sim, replace=True, random_state=random_state).values
tide = df_tide.sample(n_sim, replace=True, random_state=random_state+1).values
h = tide + df_sim[hname].values*delta_s
df_sim['h0'] = h

# at least AM tide for events with extreme
tide_am = df_tide_am.sample(n_sim, replace=True, random_state=random_state).values
h = np.maximum(tide_am, h)

# df_sim_events['h'] = 0
# df_sim_events.loc[df_sim_events_am[hname]>0, 'h'] = h
df_sim['h'] = h

In [None]:
# more than one dependent water component
# combine surge, waves with random tide
# NOTE(review): this cell reads hydrographs['w'] and df_sim_events['w'], but the
# hydrograph cell only fills 's' and `drivers` does not contain 'w' -- it looks
# like it only runs when waves are added to both; confirm before use.

s = df_sim_events.index.size
tide = df_tide.sample(s, replace=True, random_state=random_state).values
delta_s = hydrographs['s'].sample(s, replace=True, random_state=random_state+10).values
delta_w = hydrographs['w'].sample(s, replace=True, random_state=random_state).values
# negative surge contributions are clipped to zero
surge = np.maximum(0, df_sim_events['s'].values*delta_s)
# 0.2 * wave height, the same factor as used for 'sw' in the data-loading cell
wave_setup = (df_sim_events['w']*0.2).values
# only entries with an extreme wave are scaled by the sampled hydrograph factor
wave_setup = np.where(df_sim_events_am['w'].values>0, wave_setup*delta_w, wave_setup)
# df_sim_events['h'] = 0
df_sim_events['h'] = tide + surge + wave_setup 
print(surge.max(), wave_setup.max(), (surge+wave_setup).max())

# at least AM tide for events with extreme
# row index of the highest water level within each simulated year
imax = df_sim_events[['h', 'year']].groupby('year').idxmax().values.flatten()
tide_am = df_tide_am.sample(n_sim, replace=True, random_state=random_state).values
df_sim_events.loc[imax, 'h'] = np.maximum(tide_am, df_sim_events.loc[imax, 'h'])

# overwrites the 'h' column written by the single-component cell
df_sim['h'] = df_sim_events.loc[imax, 'h'].values

In [None]:
# number of surge values in the STORM dataset (display only)
da_surge.values.size

In [None]:

# Compare return periods of the STORM surges with those of the simulated surges.
# qs = np.linspace(0,1,10001)
# x0 = np.quantile(extr_dist['s'].data, qs)
# x1 = np.quantile(df_sim['s'], qs)

# fig, ax = plt.subplots(1,1)
# ax.scatter(x0,x1)
# ax.plot([0,4],[0,4], '--k')
# ax.set_xlabel('simulated surge [m]')
# ax.set_ylabel('coastrp surge [m]')
# ax.set_title('QQ-plot surge')

# plt.savefig(join(r'../FIGURES', 'surge_qq.png'), dpi=300, bbox_axes='tight')

# _get_return_periods was used here without being imported anywhere in the notebook
from stats_eva import _get_return_periods

fig, ax = plt.subplots(1,1)
x0 = np.sort(da_surge.values)
# the STORM events cover 3000 years, so the event rate is events per year
rps0 = _get_return_periods(x0, extremes_rate=x0.size/3000)

x1 = np.sort(df_sim['s'].values)
rps1 = _get_return_periods(x1)
# fixed: this series was plotted twice, which duplicated the 'coastrp' legend entry
ax.plot(rps0, x0, '.k', label='coastrp')
ax.plot(rps1, x1, '.g', label='simulated')
ax.set_xscale('log')
ax.set_xlabel('return period [years]')
ax.set_ylabel('surge [m]')
ax.legend()
ax.grid()

# fixed: the savefig keyword is `bbox_inches`, not `bbox_axes`
plt.savefig(join(r'../FIGURES', 'surge_rps.png'), dpi=300, bbox_inches='tight')

In [None]:
# Return-level plot of total water level on a Gumbel-reduced x-axis: COAST-RP
# values against the simulated levels with and without the AM-tide correction.
dvar = 'h'
# this import was commented out, leaving _get_return_periods undefined
from stats_eva import _get_return_periods

fig, ax = plt.subplots(1,1)

# coast_rp
x = df_coastrp['coast_rp'].values
rps = df_coastrp.index.values
# Gumbel reduced variate of the return period: -ln(-ln(1 - 1/T))
rps_gumb = -np.log(-np.log(1.0 - 1.0 / rps))
ax.scatter(rps_gumb, x, color='purple', label='coastrp')

x = df_coastrp['local_coastrp'].values
rps = df_coastrp.index.values
rps_gumb = -np.log(-np.log(1.0 - 1.0 / rps))
ax.scatter(rps_gumb, x, color='m', label='local_coastrp')

# simulated
x_sim = df_sim['h0'].values
rps_sim = _get_return_periods(x_sim)
rps_sim_gumb = -np.log(-np.log(1.0 - 1.0 / rps_sim))
ax.scatter(rps_sim_gumb, x_sim, color='g', label='simulated (uncorrected)')

x_sim = df_sim['h'].values
rps_sim = _get_return_periods(x_sim)
rps_sim_gumb = -np.log(-np.log(1.0 - 1.0 / rps_sim))
ax.scatter(rps_sim_gumb, x_sim, color='k', label='simulated (corrected)')

# plot_return_values(df_bm[dvar].values, ax=ax, color='g')
ax.legend()
# ax.set_xlim([1,500])
ax.set_ylim([3,7])

# ticks sit at the reduced variate but are labelled with the return period in years
_rps = np.array([1.1,2,5,10,20,50,100,500])
_rps_gumb = -np.log(-np.log(1.0 - 1.0 / _rps))
ax.set_xticks(_rps_gumb)
ax.set_xticklabels([f'{rp:.1f}' for rp in _rps])
ax.set_xlim([rps_gumb.min()*1.2, _rps_gumb[-1]*1.2])
ax.set_xlabel("Return period [years]")
ax.set_ylabel("Waterlevel [m+MSL]")
ax.grid()

# fixed: the savefig keyword is `bbox_inches`, not `bbox_axes`
plt.savefig(join(r'../FIGURES', 'waterlevel_events1.png'), dpi=300, bbox_inches='tight')


In [None]:
# dvars = ['qb', 'qp', 'p', 'h']
# df_obs_events = df_peaks0_filled[dvars].dropna(how='any')
# print(len(df_obs_events))
# rm = {k:v for k,v in labels.items() if k in dvars}
# df_merged = pd.concat([
#     df_sim_events[dvars],
#     df_obs_events, 
#     ], ignore_index=True, axis=0).rename(columns=rm)
# colors = np.hstack([
#     np.full(df_sim_events.index.size, "k"),
#     np.full(df_obs_events.index.size, "r"), 
#     ])
# sizes = np.hstack([
#     np.full(df_sim_events.index.size, 5),
#     np.full(df_obs_events.index.size, 30), 
#     ])
# axes = pd.plotting.scatter_matrix(df_merged, color=colors, s=sizes, alpha=1.0, diagonal='kde', figsize=(10,10))
# for i, dvar in enumerate(dvars):
#     axes[i,i].lines.pop(0)
#     xmin, xmax = df_merged[rm[dvar]].quantile([0.0, 0.998]).values
#     rp1 = extr_dist[dvar].ppf(1-1/1.1)
#     for j in range(len(dvars)):
#         axes[j,i].set_xlim([xmin, xmax])
#         axes[j,i].axvline(rp1, color='c', ls='-', label='rp1')#, lw=0.5)
#         if j != i: 
#             axes[i,j].set_ylim([xmin, xmax])
#             axes[i,j].axhline(rp1, color='c', ls='--')#, lw=0.5)
#     # fix ticks ..
#     xticks = axes[-1,i].get_xticks()
#     xticks = xticks[np.logical_and(xticks>xmin, xticks<xmax)]
#     axes[-1,i].set_xticks(xticks)
#     axes[-1,i].set_xticklabels(xticks, rotation=0)
#     if i == 0:
#         ymin,ymax = axes[0,0].get_ylim()
#         axes[i,i].set_yticks(xticks*(ymax-ymin)/(xmax-xmin))
#         axes[i,i].set_yticklabels(xticks)
#     df_sim_events[dvar].plot.kde(ax=axes[i,i], color='k', label='sim', legend=i==0)
#     df_obs_events[dvar].plot.kde(ax=axes[i,i], color='r', label='obs', legend=i==0)
#     if i == 0:
#         axes[i,i].set_ylabel(rm[dvar])
#     else:
#         axes[i,i].yaxis.set_visible(False)

In [None]:
# Scatter matrix of simulated (black, small) versus observed block maxima (red,
# large), with separate KDE curves for both on the diagonal.
dvars = ['qb', 'qp', 'p', 'h']
# display names: driver key -> axis label
rm = {k:v for k,v in labels.items() if k in dvars}

df_sim = df_sim#events.groupby('year').max()
df_merged = pd.concat([
    df_sim[dvars],
    df_bm[dvars],
    ], ignore_index=True, axis=0).rename(columns=rm)
# per-point colour and size, in the same row order as df_merged
colors = np.hstack([
    np.full(df_sim.index.size, "k"),
    np.full(df_bm.index.size, "r"),
    ])
sizes = np.hstack([
    np.full(df_sim.index.size, 5),
    np.full(df_bm.index.size, 30),
    ])
axes = pd.plotting.scatter_matrix(df_merged, color=colors, s=sizes, alpha=1.0, diagonal='kde', figsize=(10,10))
for i, dvar in enumerate(dvars):
    # drop the pooled KDE drawn by scatter_matrix; it is replaced by separate
    # sim/obs curves below. Artist.remove() is used because Axes.lines cannot
    # be modified in place on recent matplotlib versions.
    axes[i,i].lines[0].remove()
    # clip the axes at the 99.5% quantile of the merged data
    xmin, xmax = df_merged[rm[dvar]].quantile([0.0, 0.995]).values
    for j in range(len(dvars)):
        axes[j,i].set_xlim([xmin, xmax])
        if j != i:
            axes[i,j].set_ylim([xmin, xmax])
    # fix ticks ..
    xticks = axes[-1,i].get_xticks()
    xticks = xticks[np.logical_and(xticks>xmin, xticks<xmax)]
    axes[-1,i].set_xticks(xticks)
    axes[-1,i].set_xticklabels(xticks, rotation=0)
    if i == 0:
        # the top-left diagonal shows a density, so its y ticks are rescaled
        # to carry the data values
        ymin,ymax = axes[0,0].get_ylim()
        axes[i,i].set_yticks(xticks*(ymax-ymin)/(xmax-xmin))
        axes[i,i].set_yticklabels(xticks)
    df_sim[dvar].plot.kde(ax=axes[i,i], color='k', label='sim', legend=i==0)
    df_bm[dvar].plot.kde(ax=axes[i,i], color='r', label='obs', legend=i==0)
    if i == 0:
        axes[i,i].set_ylabel(rm[dvar])
    else:
        axes[i,i].yaxis.set_visible(False)

In [None]:
# Assign a return period to every simulated event: 1.0 by default, and for the
# annual maximum of each driver 1/sf(value), capped at 500.
# row label of the yearly maximum, per column
idxmax = df_sim_events.groupby('year').idxmax()
for dvar in df_sim_events.columns:
    # skip columns without a marginal distribution (e.g. 'year')
    if dvar not in extr_dist: continue
    df_sim_events[f'{dvar}_rp'] = 1.0
    imax = idxmax[dvar].values
    # NOTE(review): requires every entry of extr_dist to expose `.sf` -- confirm for the empirical surge distribution
    df_sim_events.loc[imax, f'{dvar}_rp'] = np.minimum(500, 1/extr_dist[dvar].sf(df_sim_events.loc[imax, dvar]))
# df_sim_events[df_sim_events['year'] == 15]

In [None]:
# get all scenarios used for linear interpolating damages
# Each event's return periods are bracketed by the neighbouring values in `rps`;
# the bracketing scenarios are then counted over all events.
rps = np.array([1,2,5,10,50,100,500], dtype=int)
cols = ['qb_rp', 'qp_rp', 'p_rp', 'h_rp']
scens = []

values = df_sim_events.loc[:,cols].values
for event in values:
    rps_lst = []
    for rp in event:
        if rp in rps:
            # exact match: lower and upper bracket coincide
            rps_lst.append([int(rp), int(rp)])
        else:
            # nearest tabulated return period below and above
            rps_lst.append([rps[rps<rp][-1], rps[rps>rp][0]])
    # NOTE(review): zip yields only two scenarios per event (all lower bounds,
    # all upper bounds), not every lower/upper combination -- confirm this is intended
    for rps0 in zip(*rps_lst):
        scens.append(list(rps0))
# # include univariate
# rps0 = np.zeros(4, dtype=int)
# for i in range(4):
#     for rp in rps[1:]:
#         _rps = rps0.tolist()
#         _rps[i] = rp
#         scens.append(_rps)
# # include full dependence
# for rp in rps:
#     scens.append(np.full(4, rp, dtype=int).tolist())
##
# unique scenarios with their number of occurrences
df_scen = pd.DataFrame(data=np.vstack(scens), columns=cols).value_counts().rename('count').reset_index()
# scenario id; note the name lists h before p while `cols` lists p before h
df_scen['scen'] = [
    f"qb{qb_rp:03d}_qp{qp_rp:03d}_h{h_rp:03d}_p{p_rp:03d}" 
    for i, (qb_rp, qp_rp, p_rp, h_rp) in df_scen[cols].iterrows()
]
df_scen
print(df_scen.index.size)
# df_scen.to_csv(fn0.replace('.csv', '_scenCount.csv'))
df_scen.sort_values('count', ascending=False).head(20)
