In [None]:
import xarray as xr
from os.path import join
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## read peaks data

In [None]:
# input (forcing) and output (results) folders, relative to this notebook
ddir = r'../../1_data/2_forcing'
rdir = r"../../4_results"

# axis labels per variable key; the text after the line break holds the unit
labels = dict(
    qb='Discharge Buzi\n[m3/s]',
    qp='Discharge Pungwe\n[m3/s]',
    p='Rainfall\n[mm/hr]',
    t='Tide\n[m+MSL]',
    s='Surge\n[m]',
    w='Sign. wave height\n[m]',
    h_ts='Total waterlevel\n[m+MSL]',
    h_tsw='Total waterlevel (incl. wave setup)\n[m+MSL]',
    ss='Skew surge\n[m]',
    ssw='Skew surge (incl. wave setup)\n[m]',
    sw='Non-tidal residual\n[m]',
)

In [None]:
# settings: resampling frequency used for the block maxima and the drivers analysed
period = 'AS-AUG'
drivers = ['p', 'qb', 'qp', 's', 'w']

# daily driver timeseries
ds = xr.open_dataset(join(ddir, 'beira_drivers_daily.nc'))

# peaks table (indexed by time) and its maxima per `period` block;
# blocks with a missing value in any column are dropped
peaks_csv = join(rdir, 'drivers_am_peaks.csv')
df_peaks0 = pd.read_csv(peaks_csv, index_col=0, parse_dates=['time'])
df_bm = df_peaks0.resample(period).max().dropna()

In [None]:
# read distributions
from eva import get_frozen_dist, rps_dist, emperical_dist, _RPS

# fitted marginal parameters, one row per variable; the 'h_tsw' row is stored under 'h_tsw0'
dist_params = pd.read_csv(join(rdir, 'marginal_params.csv'), index_col=0).rename({'h_tsw': 'h_tsw0'})
dists = {}
for dvar, row in dist_params.iterrows():
    # use explicit positional access: integer keys on a label-indexed Series
    # (row[0], row[-2:]) rely on a deprecated positional fallback in pandas
    dist_name = row.iloc[0]
    # parameters are read from the trailing columns: the last two for 'gumb', the last three otherwise
    params = row.iloc[-2:] if dist_name == 'gumb' else row.iloc[-3:]
    dists[dvar] = get_frozen_dist(params, dist_name)

# surge: distribution built from a table of return periods and surge levels
df_surge_emp_dist = pd.read_csv(join(rdir, 'marginal_surge.csv'), index_col=0)
dists['s'] = rps_dist(df_surge_emp_dist['rp[year]'].values, df_surge_emp_dist['surge[m]'].values)

## analyse co-occurrence

In [None]:
# Group individual driver peaks into "events": consecutive peaks that lie close enough
# in time are assigned the same event id, after which per-event statistics are derived.
# NOTE that `max_time_lag` is the time lag allowed between two consecutive extremes;
# the total time lag spanned by all extremes in one event could theoretically be the sum of the lag times per driver
## settings
max_time_lag = {
    # 'qp': pd.Timedelta('5D'),
    # 'qb': pd.Timedelta('5D'),
}
dt0 = pd.Timedelta('30D')  # default time lag, used for every driver without an entry in max_time_lag
slag = ''#'_lag2d'  # postfix appended to output file names

# reduce daily events to selected drivers (rows without a peak in any selected driver are dropped)
df_peaks = df_peaks0[drivers].dropna(how='all')

# combine daily events to event sets based on maximum time_lag
# dt: time elapsed since the previous row (0 for the first row)
df_peaks['dt'] = np.hstack([[0], np.diff(df_peaks.index)])
# max_dt: largest allowed lag among the drivers that have a peak in this row
df_peaks['max_dt'] = (df_peaks[drivers].notna()*np.array([max_time_lag.get(d,dt0) for d in drivers])).max(axis=1)
# df_peaks['max_dt'] = [pd.Timedelta(days=d) for d in df_peaks['max_dt'].dt.days.rolling(2, min_periods=1).max().values]
# a new event id starts at every row whose gap to the previous row is not smaller than max_dt
df_peaks['event'] = np.cumsum(~(df_peaks['dt']<df_peaks['max_dt']))
# per event: number of non-missing entries per column
# NOTE(review): dropping 'event' assumes groupby.apply passes the grouping column to the lambda — confirm for the pandas version in use
df_events = df_peaks.groupby('event').apply(lambda x: x.notna().sum()).drop(columns=['dt', 'event'])
df_events['n_extreme'] = df_events[drivers].sum(axis=1)  # total number of driver peaks in the event
# timestamp of the first peak in each event (assumes the index of df_peaks is named 'time')
df_events['time'] = df_peaks.reset_index().groupby('event').first()['time']
# shift times by the earliest day-of-year found in the block-maxima index before taking the year
offset = pd.Timedelta(days=df_bm.index.dayofyear.min()-1)
df_events['year'] = (df_events['time'] - offset).dt.year
# summed gaps between the peaks within an event, i.e. time between its first and last peak
df_events['time_lag'] = [evnt.iloc[1:,:]['dt'].sum() for _, evnt in df_peaks.groupby('event')]
# flag peaks that share their event with at least one other row
evnts, cnts = np.unique(df_peaks['event'], return_counts=True)
df_peaks['co-occur'] = np.isin(df_peaks['event'], evnts[cnts>1])

# barcode plot of the event sets: one column per event, one row per driver,
# each cell weighted by the number of extremes in that event
data = df_events[drivers[::-1]].mul(df_events['n_extreme'], axis=0).T
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
im = ax.imshow(
    data.where(data > 0),  # cells without an extreme stay blank
    interpolation='none',
    cmap='Blues',
    vmin=0,
    vmax=len(drivers),
)
ax.set_aspect(2)

# driver names (label text before the line break) along the y-axis
ax.set_yticks(np.arange(len(drivers)))
ax.set_yticklabels([labels[name].split('\n')[0] for name in drivers[::-1]])
# fig.colorbar(im, orientation='vertical')

# label every second year at the position of its first event
ax.set_xlabel('events timeline')
first_event = df_events.reset_index().groupby('year').first()['event']
ax.set_xticks(first_event.values[::2]-1.5)
ax.set_xticklabels(first_event.index.values[::2], rotation=45)

plt.tight_layout()
# plt.savefig(join(r'../../4_results', f'co-occurence_time{slag}.png'), dpi=300, bbox_inches='tight')


# remove incomplete years: a valid year holds exactly one extreme per driver,
# so the extremes summed over its events must equal the number of drivers
check = df_events[['n_extreme', 'year']].groupby('year').sum()
drop_years = check[check['n_extreme']!=len(drivers)].index.values
# `> 0` (was `> 1`): a single incomplete year must be dropped as well
if drop_years.size > 0:
    df_events = df_events[~np.isin(df_events['year'], drop_years)]
    print(f'ignore years with more/less events than drivers: {drop_years}')
print(f'no. of valid years: {np.unique(df_events.year).size}')
print(f'no. of events: {df_events.index.size}; (>1 extreme: {df_events[df_events.n_extreme>1].index.size})')
print(f'max time lag: {df_events.time_lag.max().days} days')


In [None]:
# count how often each combination of drivers (event type) occurs, most frequent first
df_events_dist = df_events.groupby(drivers + ['n_extreme']).size().reset_index(name='count')
df_events_dist = df_events_dist.sort_values('count', ascending=False)

# the leading (most frequent) event types are drawn in black, all others in orange
n_highlight = 5
n_types = df_events_dist.index.size
type_colors = np.where(np.arange(n_types) < n_highlight, 'k', 'orange')

fig, (ax1, ax) = plt.subplots(2,1, figsize=(7,3), sharex=True, gridspec_kw={'height_ratios': [2, 1], 'hspace':0.0})

# top panel: number of events per event type
colors = list(type_colors)
df_events_dist.reset_index()['count'].plot.bar(ax=ax1, color=colors)
ax1.set_ylabel('count [-]')

# bottom panel: one dot per (event type, driver) that takes part in the event type.
# Colours are looked up per event type (x), so both panels always agree; the previous
# version coloured the first five *points*, which differs as soon as one of the
# leading event types involves more than one driver.
x, y = np.where(df_events_dist[drivers])
ax.scatter(x, y, color=list(type_colors[x]))
ax.set_ylim([-0.5, len(drivers)-0.5])
ax.set_yticks(np.arange(len(drivers)))
ax.set_yticklabels([labels[dvar].split('\n')[0] for dvar in drivers])
ax.set_xlabel('event type')
ax.set_xticklabels('')

plt.tight_layout()

# plt.savefig(join(r'../../4_results', f'co-occurence_hist{slag}.png'), dpi=300, bbox_inches='tight')

## dependence modelling

In [None]:
import pyvinecopulib as pv

# pseudo-observations of the block maxima (empirical transform per driver)
u_obs = pv.to_pseudo_obs(df_bm[drivers].values)
df_uobs = pd.DataFrame(u_obs, columns=drivers)

# candidate pair-copula families: ids 0 to 10
# (for an independence-only fit use family_set, postfix = [pv.BicopFamily(0)], '_indep')
family_set = [pv.BicopFamily(fid) for fid in range(11)]
postfix = ''

# fit settings; options left out keep their library defaults
# (parametric_method='itau', nonparametric_method='quadratic' and
#  tree_criterion='tau' [tau, hoeffd, rho, mcor -> no effect ?!] were tried but are not used)
controls = pv.FitControlsVinecop(
    family_set=family_set,
    threshold=0.05,             # tau threshold (0.0 default!)
    selection_criterion='aic',  # loglik, bic, aic (bic default!)
    show_trace=True,
)

# fit the vine copula; the structure is selected by the library
# (a fixed one could be passed with structure=pv.RVineStructure(len(drivers)))
cop = pv.Vinecop(data=df_uobs.values, controls=controls)
# cop.structure

In [None]:
# display the structure of the fitted vine copula as the cell output
cop.structure

In [None]:
# Collect one record per pair-copula of the fitted vine (tree level, edge number,
# variables involved, family, parameters, Kendall's tau), print it and save the table.
data = []
m = cop.matrix  # vine structure matrix; entries are 1-based variable numbers (hence the -1 below)
n = m.shape[0]
v = drivers  # variable names, in the column order of the data the copula was fitted on
for t in range(n-1):  # tree level, 0-based
    # print(f'** Tree: {t:d}')
    for e in range(n-1-t):  # edge number within the tree, 0-based
        # the two variables of the edge, read from the anti-diagonal entry and row t of column e
        # NOTE(review): assumes the pyvinecopulib structure-matrix convention — confirm against the library docs
        p1, p2 = v[int(m[n-1-e,e]-1)], v[int(m[t,e]-1)]
        # conditioning variables: the entries above row t in the same column
        px = [v[int(p-1)] for p in m[:t,e]]
        c = cop.get_pair_copula(t,e)
        tau = cop.get_tau(t,e)  # NOTE: different from scipy.stats method ?
        pxs = f' | ' + ','.join(px) if px else ''
        edge = f'{p1},{p2}{pxs}'
        cstr = c.str().replace(f'\n',',')
        print(f'{p1},{p2}{pxs}: {cstr}; tau = {tau:.5f}')
        # the tree level is stored 1-based (t+1), the edge number 0-based;
        # the family name is taken as the text before the first comma of the copula description
        data.append([t+1, e, edge, [p1,p2], px, c.str().split(',')[0], c.parameters.flatten(), tau])

# 'edge#' holds the numeric edge index, 'edge' the text label ("p1,p2 | conditioning")
df_cop = pd.DataFrame(
    data=data,
    columns=['tree', 'edge#', 'edge', 'pair', 'conditional', 'copula', 'parameters', 'tau']
)
# written to the current working directory
df_cop.set_index(['tree', 'edge#']).to_csv('copula.csv')


In [None]:
# Sample uniform marginals for each driver and transform them to driver values
seeds = np.arange(len(drivers), dtype=int)
# Keep a handle on the seeded generator and draw from it. The previous code created a
# RandomState and discarded it, which left np.random unseeded and the samples irreproducible.
rng = np.random.RandomState(np.sum(seeds))
n_sim = 30000

# Dependence scenario (exactly one is used; its postfix ends up in the output file names):
#   'copula'  - sample from the fitted vine copula
#   'indep'   - fully independent drivers
#   'fulldep' - fully dependent drivers (the same uniform value for every driver)
# 'fulldep' matches what this cell effectively produced before, when the three
# alternatives were assigned one after another and the last assignment won.
dependence = 'fulldep'
if dependence == 'copula':
    usim, postfix = cop.simulate(n_sim, seeds=seeds), '_30k'
elif dependence == 'indep':
    usim, postfix = rng.random_sample((n_sim, len(drivers))), '_indep1_30k'
elif dependence == 'fulldep':
    usim, postfix = np.tile(rng.random_sample((n_sim, 1)), len(drivers)), '_fulldep1_30k'
else:
    raise ValueError(f'unknown dependence scenario: {dependence!r}')
df_usim = pd.DataFrame(data=usim, columns=drivers)

# transform back based on marginal distributions; negative values are clipped to zero
df_sim = pd.DataFrame()
s = df_usim.index.size
for dvar in drivers:
    df_sim[dvar] = np.maximum(0, dists[dvar].ppf(df_usim[dvar].values))

# df_sim_q = pd.DataFrame()
# df_sim_q[dvar] = np.quantile(df_bm[dvar], df_usim[dvar])

df_sim.head()

## create stochastic event set

In [None]:
# Build the stochastic event set: every simulated year borrows the co-occurrence
# pattern of an observed year (cycling through the observed years in order) and
# scales that pattern with the simulated driver values.
from itertools import cycle

yearly_patterns = cycle(df_events.groupby('year'))
sim_events = []
for sim_idx, sim_values in df_sim[drivers].iterrows():
    _, pattern = next(yearly_patterns)
    # per event and driver: (number of extremes in the pattern) x (simulated value)
    scaled = pattern[drivers] * sim_values
    scaled['year'] = sim_idx + 1  # simulated years are numbered from 1
    sim_events.append(scaled)

df_sim_events0 = pd.concat(sim_events, axis=0, ignore_index=True)
df_sim_events = df_sim_events0
df_sim_events0.head()

In [None]:
# Replace the zeros (driver not extreme in that event) by ordinary values;
# required for simulating h_tsw & for the plot comparing simulated with observed events
xval_norm = {}
thresh_lst = df_sim.quantile(0.01)
s = df_sim_events0.index.size
df_sim_events = df_sim_events0.copy()
for dvar in drivers:
    # non-extremes are drawn at random (with replacement) from all daily values
    # below the level returned by the marginal for a return period of 1.01
    thresh = dists[dvar].ppf(1-1/1.01)
    below_thresh = ds[dvar] < thresh
    xval_norm = ds[dvar].where(below_thresh, drop=True).to_series().dropna()
    x0 = xval_norm.sample(s, replace=True, random_state=np.sum(seeds))
    xs = df_sim_events0[dvar]
    # positional replacement: keep simulated extremes, fill zeros with the sampled values
    df_sim_events[dvar] = np.where(xs.eq(0), x0.to_numpy(), xs.to_numpy())

In [None]:
# tide series: all daily values and the maximum per 'AS' (year-start) block
tide = ds['t'].dropna('time').to_series()
tide_am = ds['t'].resample(time='AS').max('time').dropna('time').to_series()

# give every simulated event a randomly drawn daily tide
df_sim_events['t'] = tide.sample(s, replace=True, random_state=np.sum(seeds)).values

# total water level = tide + surge (+ 0.2 x wave height when waves are available)
has_waves = 'w' in ds
hname = 'h_tsw' if has_waves else 'h_ts'
h_total = df_sim_events['t'] + df_sim_events['s']
if has_waves:
    h_total = h_total + 0.2 * df_sim_events['w']
df_sim_events[hname] = h_total

# the highest event of every simulated year is raised to at least a randomly drawn annual maximum tide
htot_am_idx = df_sim_events[[hname, 'year']].groupby('year').idxmax().values.flatten()
tide_am_sample = tide_am.sample(s, replace=True, random_state=np.sum(seeds)).values
h_am = np.maximum(df_sim_events[hname], tide_am_sample)
h_top = df_sim_events.loc[htot_am_idx, hname]
df_sim_events.loc[htot_am_idx, hname] = np.maximum(h_top, h_am.loc[htot_am_idx])

# annual maxima per simulated year, and the row labels at which they occur
by_year = df_sim_events.groupby('year')
df_sim_am_idx = by_year.idxmax()
df_sim_am = by_year.max()
df_sim_am.head()

In [None]:
from eva import emperical_dist

# distribution of total water level built from the stored baseline simulation
df_sim_am0 = pd.read_csv(join(rdir, 'sim_AM.csv'), index_col=0)
h_ref = df_sim_am0['h_tsw']
dists['h0'] = emperical_dist(h_ref.values, h_ref.size)

# assign return periods by rank (lowest first) and replace the simulated water levels
# with the baseline level at those return periods
n, nyears = df_sim_events.index.size, df_sim_am.index.size
rps = (1/(1-np.arange(n)/n)*(nyears/n))
ranked_idx = df_sim_events['h_tsw'].sort_values().index
df_sim_events.loc[ranked_idx, 'h_tsw'] = dists['h0'].ppf(1-1/rps)

# carry the remapped levels over to the annual maxima table
am_rows = df_sim_am_idx['h_tsw'].values
df_sim_am['h_tsw'] = df_sim_events.loc[am_rows, 'h_tsw'].values


In [None]:
# write annual maxima and the full event set, rounded to 3 decimals;
# important to keep return values of h_tsw consistent with sensitivity analysis
am_path = join(rdir, f'sim_AM{postfix}{slag}x.csv')
events_path = join(rdir, f'sim_EVENTS{postfix}{slag}x.csv')
df_sim_am.round(3).to_csv(am_path)
df_sim_events.round(3).to_csv(events_path)

### plot stochastic event set

In [None]:
# distribution of total water level built from the stored baseline simulation
df_sim_am0 = pd.read_csv(join(rdir, 'sim_AM.csv'), index_col=0)
h_ref = df_sim_am0['h_tsw']
dists['h_tsw'] = emperical_dist(h_ref.values, h_ref.size)

# return levels of every distribution at return period 1.1 plus the periods in _RPS
RPS = np.concatenate([[1.1], _RPS])
df_rps = pd.DataFrame(columns=dists.keys(), index=RPS)
for dvar, dist in dists.items():
    df_rps[dvar] = dist.ppf(1-1/RPS)

# index by the return period rounded down to whole years (1.1 becomes 1)
df_rps.index = pd.Index(np.floor(RPS).astype(int), name='rps')
df_rps.to_csv(join(rdir, f'marginal_rps.csv'))
df_rps

In [None]:
from scipy.stats import kendalltau, pearsonr, gaussian_kde

# One panel per pair-copula of the vine, laid out on a lower-triangular grid:
# column = edge number, row = edge number + tree level (both 0-based).
n = len(drivers)-1
fig, axes = plt.subplots(n,n, sharex=False, sharey=False, figsize=(n*2.5,n*2.5), gridspec_kw={'hspace':0.0, 'wspace':0.0})

# df_uobs_cooc = df_uobs.where(df_peaks[drivers][df_peaks['co-occur']].resample(period).max().reset_index(drop=True).notna())
# maxima per `period` block of the peaks that are part of a multi-peak event
df_obs_cooc = df_peaks[drivers][df_peaks['co-occur']].resample(period).max()

# hide the upper triangle; it holds no pair-copula
for r in range(n):
    for c in range(r+1, n):
        axes[r,c].set_visible(False)

for _, c0 in df_cop.iterrows():
    # df_cop keeps the numeric edge index in 'edge#' ('edge' is the text label) and a
    # 1-based tree number; the previous code read c0.edge / c0.tree directly, which
    # fails on the string label and would index one row past the grid.
    c = int(c0['edge#'])
    t = int(c0['tree']) - 1  # 0-based tree level
    r = c + t
    ax = axes[r,c]
    xlab, ylab = c0.pair
    x, y = df_bm[xlab].values, df_bm[ylab].values
    # plot window: up to the 95% quantile, with extra head room in y for the text box
    xmin, xmax = np.quantile(x, [0,0.95])
    ymin, ymax = np.quantile(y, [0,0.95])
    ymax = ymax + (ymax-ymin)*0.4

    # density of the simulated values as background (blue; green when the pair is independent)
    xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    # kernel = gaussian_kde(df_bm[[xlab,ylab]].values.T, bw_method=0.5)
    kernel = gaussian_kde(df_sim[[xlab,ylab]].values.T, bw_method=0.5)
    f = np.reshape(kernel(positions).T, xx.shape)
    cmap = 'Blues' if  c0.copula != 'Independence' else 'Greens'
    ax.contourf(xx, yy, f, cmap=cmap, alpha=0.5)

    # observed block maxima; the co-occurring ones are drawn on top in orange
    xcooc, ycooc = df_obs_cooc[xlab].values, df_obs_cooc[ylab].values
    ax.scatter(xcooc, ycooc, s=13, color='darkorange', label='compound', zorder=2)
    ax.scatter(x, y, s=12, color='black', label='other', alpha=0.9, zorder=1)
    if r == n-1 and c == n-2:
        ax.legend(title='AM events', loc='lower left', bbox_to_anchor=(1.1, 1.1))

    # text box: [pair | conditioning variables], copula family and Kendall's tau
    kwargs = dict(
        horizontalalignment='left',
        verticalalignment='top',
        transform=ax.transAxes,
        bbox=dict(facecolor='white', alpha=0.5, lw=0.1)
    )
    # only trees above the first one have conditioning variables
    cs = ' | ' + ','.join(c0.conditional) if t > 0 else ''
    ps = ','.join(c0.pair[::-1])
    ns = f'{c0.copula}' + r' ($\tau$=' + f'{c0.tau:.2f})' if c0.copula != 'Independence' else c0.copula
    txt = f'[{ps}{cs}]\n{ns}'
    ax.text(0.02, 0.98, txt, **kwargs)

    # axis labels only on the outer panels
    if c == 0:
        ax.set_ylabel(labels[ylab].replace('\n', f' ({ylab})\n'))
    else:
        ax.set_yticklabels('')
    if r == n-1:
        ax.set_xlabel(labels[xlab].replace('\n', f' ({xlab})\n'))
    ax.set_ylim([ymin, ymax])
    ax.set_xlim([xmin, xmax])

# plt.savefig(join(r'../../4_results', f'copula_autofit{postfix}.png'), dpi=300, bbox_inches='tight')

In [None]:
# observed peaks table, with missing entries filled from the daily series at the same time
df_peaks0_filled = pd.DataFrame()
for dvar in df_peaks0.columns:
    peaks = df_peaks0[dvar]
    daily_at_peaks = ds[dvar].sel(time=df_peaks0.index)
    df_peaks0_filled[dvar] = peaks.where(peaks.notna(), daily_at_peaks)

# variables to compare (drivers + total water level) and their labels;
# rm_lab moves bracketed text in a label to a new line
dvars = drivers + [hname]
rm = {key: text for key, text in labels.items() if key in dvars}
rm_lab = {key: text.replace(' (', '\n(') for key, text in rm.items()}

# stack simulated events on top of the observed ones, columns renamed to their labels
df_obs_events = df_peaks0_filled[dvars].dropna(how='any')
stacked = [df_sim_events[dvars], df_obs_events]
df_merged = pd.concat(stacked, ignore_index=True, axis=0).rename(columns=rm)

# marker style per row: simulated = small black, observed = large red
n_sim_rows, n_obs_rows = df_sim_events.index.size, df_obs_events.index.size
colors = np.repeat(np.array(["k", "r"]), [n_sim_rows, n_obs_rows])
sizes = np.repeat(np.array([5, 30]), [n_sim_rows, n_obs_rows])


In [None]:
# Scatter matrix of simulated (black) against observed (red) events.
# df_merged already carries the full labels as column names, so the wrapped labels must be
# looked up by those (renaming with the short keys of rm_lab matched nothing).
wrapped_labels = {rm[key]: rm_lab[key] for key in rm}
axes = pd.plotting.scatter_matrix(df_merged.rename(columns=wrapped_labels), color=colors, s=sizes, alpha=1.0, diagonal=None, figsize=(10,10))
for i, dvar in enumerate(dvars):
    # clear whatever lines exist on the diagonal panel before drawing the densities;
    # with diagonal=None there are none, so popping the first line unconditionally would fail
    for line in list(axes[i,i].lines):
        line.remove()
    xmin, xmax = df_merged[rm[dvar]].quantile([0.0, 0.998]).values
    # level returned by the marginal distribution for a return period of 1.01
    rp1 = dists[dvar].ppf(1-1/1.01)
    # rp1 = thresh_lst[dvar]
    for j in range(len(dvars)):
        # same value range and rp1 marker wherever this variable is on the x-axis (column i) ...
        axes[j,i].set_xlim([xmin, xmax])
        axes[j,i].axvline(rp1, color='c', ls='-', label='rp1')#, lw=0.5)
        # ... and wherever it is on the y-axis (row i), except on the diagonal
        if j != i:
            axes[i,j].set_ylim([xmin, xmax])
            axes[i,j].axhline(rp1, color='c', ls='--')#, lw=0.5)
    # fix ticks: keep only the ticks that fall inside the plotted range
    xticks = axes[-1,i].get_xticks()
    xticks = xticks[np.logical_and(xticks>xmin, xticks<xmax)]
    axes[-1,i].set_xticks(xticks)
    axes[-1,i].set_xticklabels(xticks, rotation=0)
    if i == 0:
        # rescale the tick positions of the first diagonal panel to its own y-range
        ymin,ymax = axes[0,0].get_ylim()
        axes[i,i].set_yticks(xticks*(ymax-ymin)/(xmax-xmin))
        axes[i,i].set_yticklabels(xticks)
    # kernel density of simulated and observed values on the diagonal
    df_sim_events[dvar].plot.kde(ax=axes[i,i], color='k', label='sim', legend=i==0)
    df_obs_events[dvar].plot.kde(ax=axes[i,i], color='r', label='obs', legend=i==0)
    if i == 0:
        axes[i,i].set_ylabel(rm[dvar])
    else:
        axes[i,i].yaxis.set_visible(False)

# show the lower triangle only
# NOTE(review): `c >= r` also hides the diagonal panels with the densities drawn above — confirm this is intended
for r in range(len(dvars)):
    for c in range(len(dvars)):
        if c >= r:
            axes[r,c].set_visible(False)
            continue

In [None]:
# Scatter matrix of simulated (black) against observed (red) annual maxima.
df_merged = pd.concat([
    df_sim_am[dvars],
    df_bm[dvars],
    ], ignore_index=True, axis=0).rename(columns=rm_lab)
# marker style per row: simulated = small black, observed = large red
colors = np.hstack([
    np.full(df_sim_am.index.size, "k"),
    np.full(df_bm.index.size, "r"),
    ])
sizes = np.hstack([
    np.full(df_sim_am.index.size, 8),
    np.full(df_bm.index.size, 30),
    ])
axes = pd.plotting.scatter_matrix(df_merged, color=colors, s=sizes, alpha=1.0, diagonal=None, figsize=(10,10))
for i, dvar in enumerate(dvars):
    xmin, xmax = df_merged[rm_lab[dvar]].quantile([0.02, 0.998]).values
    for j in range(len(dvars)):
        # same value range wherever this variable is on the x-axis (column i) or y-axis (row i)
        axes[j,i].set_xlim([xmin, xmax])
        if j != i:
            axes[i,j].set_ylim([xmin, xmax])
    # fix ticks: keep only the ticks that fall inside the plotted range
    xticks = axes[-1,i].get_xticks()
    xticks = xticks[np.logical_and(xticks>xmin, xticks<xmax)]
    axes[-1,i].set_xticks(xticks)
    axes[-1,i].set_xticklabels(xticks, rotation=0)
    if i == 0:
        # rescale the tick positions of the first diagonal panel to its own y-range
        ymin,ymax = axes[0,0].get_ylim()
        axes[i,i].set_yticks(xticks*(ymax-ymin)/(xmax-xmin))
        axes[i,i].set_yticklabels(xticks)

# show the lower triangle only (the diagonal is empty because diagonal=None)
for r in range(len(dvars)):
    for c in range(len(dvars)):
        if c >= r:
            axes[r,c].set_visible(False)
            continue

# Legend via empty proxy lines, so no stray marker is drawn. The colours follow the
# scatter above: simulated = black, observed = red (the labels used to be swapped).
axes[1,0].plot([], [], '.k', label='sim')
axes[1,0].plot([], [], '.r', label='obs')
axes[1,0].legend()

plt.tight_layout()
plt.subplots_adjust(left=0.1,right=1,bottom=0.1,top=1, wspace=0, hspace=0)
# bbox_inches is the savefig keyword for a tight bounding box ('bbox_axes' is not a valid keyword)
plt.savefig(join(r'../../4_results', f'stochastic_events_AM{postfix}.png'), dpi=300, bbox_inches='tight')
