In [None]:
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
import numpy as np
from os.path import join, isfile
from copy import deepcopy

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from plot_tools import *
import seaborn as sns
from string import ascii_uppercase as letters

In [None]:
# Every input and output path of this notebook hangs off a single project root.
root = '/scratch/compound_hotspots'
# ddir: directory the data files below are read from; fdir: directory figures are written to.
ddir, fdir = join(root, 'data', '4-postprocessed'), join(root, 'reports', 'figures')

In [None]:
# Settings that select which pre-computed peak files / slices are analysed.
rp = 2          # value handed to .sel(rp=...) on the return-period ratio dataset
Npeaks = 50     # presumably matches the "top50" token in the frequency file name -- TODO confirm
min_dist = 30   # fills the "d{min_dist}" token of the peak file names
q = 95          # fills the "q{q}" token of the peak file names

In [None]:
# Load the river-mouth attribute table and show how many distinct
# 'gtsm_station_id' values it contains.
attrs_fn = join(ddir, 'rivmth_mean_attrs.csv')
attrs = pd.read_csv(attrs_fn, index_col='index')
attrs = attrs.rename(columns={'rivmth_lat': 'lat', 'rivmth_lon': 'lon'})
len(np.unique(attrs['gtsm_station_id']))

In [None]:
# Re-read the attribute table and reduce it to the predictor matrix:
# drop the columns listed in `dropcols`, order the remaining columns alphabetically.
attrs_fn = join(ddir, 'rivmth_mean_attrs.csv')
dropcols = [
    'gtsm_lat', 'gtsm_lon', 'gtsm_station_id', 'lat', 'lon', 'rivmth_idx', 'dist2coast',
    'Qmsl_amax', 'Qmsl_amin', 'Qmsl_mean', 'Q_amin', 'Htiderange_amin', 'Htiderange_amax', 'Hsurge_amax',
]
attrs = (
    pd.read_csv(attrs_fn, index_col='index')
    .rename(columns={'rivmth_lat': 'lat', 'rivmth_lon': 'lon'})
    .drop(columns=dropcols)
    .sort_index(axis=1)
)
# log10-transform every column whose name starts with 'Q', plus 'uparea'.
log_cols = [name for name in attrs.columns if name.startswith('Q') or name == 'uparea']
attrs = attrs.assign(**{name: np.log10(attrs[name]) for name in log_cols})

In [None]:
# Ensemble-mean ratios at the selected `rp`, kept only where the ensemble members
# agree on the sign AND the mean is large relative to the ensemble spread.
dim = 'ensemble'
fn_peaks_ratio = join(ddir, f'rivmth_peaks_q{q}d{min_dist}_rp_ratio2.nc')
ds_rp = xr.open_dataset(fn_peaks_ratio).sel(rp=rp)
N = ds_rp[dim].size
ds_rp_mean = ds_rp.mean(dim)
# NumPy ufuncs are applied to the xarray objects directly: the `xr.ufuncs`
# wrappers were deprecated in favour of this and are missing from a range of
# xarray releases, which made this cell fail with an AttributeError there.
# Direction test: |sum of signs| equals N only if all members share one sign.
ds_rp_dir = np.fabs(np.sign(ds_rp).sum(dim)) == N
# Magnitude test: |mean / std| must exceed 2 / sqrt(N - 1).
ds_rp_sign = np.fabs(ds_rp_mean / ds_rp.std(dim)) > (2 / np.sqrt(N - 1))
df_rp_sign = np.logical_and(ds_rp_sign, ds_rp_dir).reset_coords(drop=True).to_dataframe()
# Reuse the mean computed above instead of averaging over a hard-coded 'ensemble' again.
df_rp = ds_rp_mean.reset_coords(drop=True).to_dataframe()
# Entries failing either test become NaN.
df_rp = df_rp.where(df_rp_sign)

In [None]:
# Split 'peak_perc' into one array per scenario (named peak_perc_<scen>), merge them
# back into a single dataset and average over the 'ensemble' dimension.
fn_peaks_freq = join(ddir, f'rivmth_peaks_q{q}d{min_dist}_top50_all_freq.nc')
ds_freq = xr.open_dataset(fn_peaks_freq)
peak_perc = ds_freq['peak_perc']
da_lst = [
    peak_perc.sel(scen=scen).drop('scen').rename(f'peak_perc_{scen}')
    for scen in ds_freq.scen.values
]
df_freq = xr.merge(da_lst).mean('ensemble').reset_coords(drop=True).to_dataframe()

In [None]:
# Single analysis table: predictors (attrs) next to the two groups of response
# variables (df_rp, df_freq), joined column-wise on the shared index.
df0 = pd.concat([attrs, df_rp, df_freq], axis=1)

# Response columns used below, mapped to the labels shown in the figures.
# The diff_surge_seas / diff_surge_tide / diff_seas_tide variants are left out.
predictants_dict = dict([
    ('peak_perc_surge', 'Hsurge (perc)'),
    ('ratio_surge_seas', 'Hsurge (ratio Hseas)'),
    ('ratio_surge_tide', 'Hsurge (ratio Htide)'),
    ('peak_perc_seas', 'Hseas (perc)'),
    ('ratio_seas_tide', 'Hseas (ratio Htide)'),
    ('peak_perc_tide', 'Htide (perc)'),
])
predictants = list(predictants_dict)
predictors = list(attrs.columns)


In [None]:
from scipy.stats import pearsonr, zscore
import pandas as pd
def nanzscore(x):
    """Standardise ``x`` to z-scores, ignoring NaN entries.

    Parameters
    ----------
    x : array-like
        Numeric values (e.g. a numpy array or a pandas Series), possibly
        containing NaN.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``x``. Non-NaN entries hold the z-score
        computed over the non-NaN values only; NaN entries stay NaN.
    """
    values = np.asarray(x, dtype=float)
    # Start from NaN rather than 1 so a missing value never shows up as a
    # valid-looking score, and use a float buffer so integer input is not
    # truncated when the scores are written back.
    z = np.full(values.shape, np.nan)
    valid = ~np.isnan(values)
    if valid.any():
        z[valid] = zscore(values[valid])
    return z

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values between numeric columns.

    Rows containing a NaN in any column are dropped first (listwise deletion),
    then non-numeric columns are discarded. Note that this differs from
    ``DataFrame.corr``, which excludes NaN per pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by the numeric column names,
        holding the two-sided p-value returned by ``scipy.stats.pearsonr`` for
        each column pair. Errors raised by ``pearsonr`` propagate unchanged.
    """
    df = df.dropna()._get_numeric_data()
    cols = df.columns
    pvalues = pd.DataFrame(np.nan, index=cols, columns=cols, dtype=float)
    for i, r in enumerate(cols):
        # The matrix is symmetric: compute each pair once and mirror it.
        for c in cols[i:]:
            p = pearsonr(df[r], df[c])[1]
            # Write through .at on the frame itself. The chained form
            # `pvalues[r][c] = p` assigns into a temporary under pandas
            # copy-on-write and would leave the table empty.
            pvalues.at[c, r] = p
            pvalues.at[r, c] = p
    return pvalues

In [None]:
# Value range of the predictand-vs-predictor correlations. Computed locally from
# df0 because `corr` is only assigned in the following cell, so reading it here
# raised a NameError on a fresh kernel (Restart & Run All).
_corr_sel = df0.corr().loc[predictants, predictors]
_corr_sel.values.min(), _corr_sel.values.max()

In [None]:
# Pearson correlation of every predictand (rows) with every predictor (columns);
# cells whose p-value is below `alpha` are annotated with an 'x'.
alpha = 0.05
corr = df0.corr()
pval = calculate_pvalues(df0)
# NOTE(review): DataFrame.corr excludes NaN per column pair, whereas
# calculate_pvalues drops every row that has a NaN in any column, so colours and
# markers may rest on different samples -- confirm this is intended.
sign = pd.DataFrame('x', index=corr.index, columns=corr.columns).where(pval < alpha, '')

# Keep only the predictand x predictor part and switch to the display labels.
pval, corr, sign = (
    frame.loc[predictants, predictors].rename(predictants_dict)
    for frame in (pval, corr, sign)
)

# Figure size scales with the number of cells (1.5 inch per row / column).
f, ax = plt.subplots(figsize=(len(predictors)*1.5, len(predictants)*1.5))

# Diverging palette, discretised into 20 equal bins between vmin and vmax.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
vmin, vmax = -0.6, 0.6
norm = BoundaryNorm(np.linspace(vmin, vmax, 21), cmap.N)

heatmap_kws = dict(
    annot=sign, fmt='s',
    cmap=cmap, vmin=vmin, vmax=vmax, norm=norm, center=0,
    square=True, linewidths=.5,
    cbar_kws={'label': 'pearson rho [-]', "shrink": .8},
)
ax = sns.heatmap(corr, ax=ax, **heatmap_kws)
# Explicit y-limits spanning all rows; presumably a guard against clipped outer
# rows in some matplotlib versions -- TODO confirm it is still needed.
ax.set_ylim([0, len(predictants)])
f.tight_layout()
fn = join(fdir, 'attrs_corr_heatmap.png')
f.savefig(fn, dpi=300)

In [None]:
# Scatter matrix of the predictands (rows, in reversed order) against the predictors
# (columns). Values whose per-column z-score exceeds 4 in magnitude are masked first.
df_z = df0.apply(nanzscore, axis=0)
within_limit = df_z.abs() <= 4
df = df0.where(within_limit)
scatter_kws = dict(facecolor='grey', edgecolor=None)
g = sns.pairplot(df, x_vars=predictors, y_vars=list(reversed(predictants)), plot_kws=scatter_kws)
fn = join(fdir, 'attrs_pair_scatter.png')
plt.savefig(fn, transparent=True, dpi=225)