In [None]:
import sys
sys.path.append('../python_packages_static/')
import pandas as pd
import pyemu
import matplotlib.pyplot as plt
import numpy as np
import os, glob, shutil
import geopandas as gp
import datetime as dt
import flopy as fp
from shapely.geometry import Point
from matplotlib.backends.backend_pdf import PdfPages

# Postprocessing of iES runs. There are two key decisions to make: (1) which iteration is the best (subjective) tradeoff between fit and variance in the ensemble, and (2) given the chosen iteration, what is an appropriate cutoff for rejection sampling. These decisions lead to visualization of the observation fits and generation of the ensemble to be used with MODPATH for the source water area delineation.

In [None]:
# Run configuration: everything a reader might need to tune lives in this cell.
rundir = '../run_data'                # directory in which runs took place
pstroot = 'never_ies_0.noise_lt_obs'  # PST file root for iES
outfolder = '{}/postproc/'.format(rundir)  # write out output files to this location
obs_data = '../obs_data'              # observation data location (not referenced by the cells below)
drop_pdc = False                      # if True, observations listed in the PDC file get zero weight before residual plots

# PDFs are written into `outfolder` further down; create it up front so a fresh
# checkout does not fail at the first write.
os.makedirs(outfolder, exist_ok=True)

### Visualize the Phi of the whole ensemble as it evolves over the iterations

In [None]:
# Read the per-iteration phi table written by iES; the first CSV column becomes the row index.
phi = pd.read_csv(os.path.join(rundir,"{}.phi.actual.csv".format(pstroot)),index_col=0)
# NOTE(review): `rf` is not imported in this notebook's import cell; confirm which module it is and add the import.
plt.figure(figsize=rf.doublecolumn_size)
# Base realization in red, then a single realization column in grey so the first legend call has two entries.
ax = phi['base'].apply(np.log10).plot(legend=False, lw=1.5, color='r', label='base')
phi.iloc[:,6:7].apply(np.log10).plot(legend=False,lw=0.5,color='k',alpha=0.15,label='realizations', ax = ax)
plt.legend(['base','realizations'])
# Every column from position 6 onward is drawn as a realization
# (presumably the leading columns are summary statistics and 'base' -- confirm against the CSV).
phi.iloc[:,6:].apply(np.log10).plot(legend=False,lw=0.5,alpha=0.15,color='k', ax = ax)
# Redraw the base realization last so it sits on top of the grey lines.
phi['base'].apply(np.log10).plot(legend=False, lw=1.5, color='r', ax=ax)
plt.ylabel('log Phi')
plt.xlabel('iES iteration')
# NOTE(review): tick positions are hard-coded to 0..9 rather than taken from phi.index.
plt.xticks(ticks=np.arange(10))
ax.axes.tick_params(length=7, direction='in', right=True, top=True)
# Final legend replaces the one created above; labels apply to the first two lines drawn.
plt.legend(['base','realizations'], title='EXPLANATION', frameon=False, bbox_to_anchor =(0.97, 0.95))
plt.savefig('../report_materials/phi_history_iES.pdf')

In [None]:
# Display the phi table (rows are indexed by the first CSV column) for a quick look at the numbers.
phi

### a `bounds_report` helps visualize, for the base realization, how many parameters were at their bounds. This can help guide the decision of whether iterations are overfit or not.

In [None]:
# Load the control file here so this cell does not depend on `pst` being created by a
# later cell (on a fresh kernel the original order raised NameError).
pst = pyemu.Pst('{0}/{1}.pst'.format(rundir, pstroot))

# NOTE(review): bounds_report() is called without `iterations`; confirm which
# iterations it covers by default if more than one is expected on the plot.
df = pst.bounds_report()
# Keep only the columns whose name contains 'either' and pull out the 'total' row.
df_tot = df[[i for i in df.columns if 'either' in i]].loc['total'].copy()
# The trailing "_<n>" token of each column name is parsed as an integer and used as the x value.
df_tot.index = [int(i.split('_')[-1]) for i in df_tot.index]
df_tot.plot()
plt.xlabel('iteration')
plt.ylabel('total at bounds')
# A bare plt.grid() toggles visibility; it used to be called twice, which switched the
# grid straight back off. Turn it on explicitly, once.
plt.grid(True)

### we are choosing iteration 4 as best

In [None]:
# Iteration chosen as the best tradeoff; used to pick the matching residual file.
best_iter = 4
control_file = '{0}/{1}.pst'.format(rundir, pstroot)
base_rei = os.path.join(rundir, '{}.{}.base.rei'.format(pstroot, best_iter))
# Attach the base-realization residuals of the chosen iteration to the control file object.
pst = pyemu.Pst(control_file, resfile=base_rei)
obs = pst.observation_data

### we can make a quick 1to1 plot of the base ensemble member

In [None]:
# Build the output PDF name first, then hand it to pyemu's 1to1 plot.
base_1to1_pdf = '{0}/{1}.{2}_iter_{3}.pdf'.format(outfolder, pstroot, '.base.1to1', best_iter)
pst.plot(kind='1to1', filename=base_1to1_pdf)


### here we decide whether or not we will visualize results which were in prior data conflict (PDC)

In [None]:
if drop_pdc is True:
    # PDC file lists the observations flagged as being in prior-data conflict
    pdc_csv = os.path.join(rundir, '{}.pdc.csv'.format(pstroot))
    pdc = pd.read_csv(pdc_csv)
    # lower-case the names before using them as labels into the observation data
    pdc['name'] = [nm.lower() for nm in pdc['name']]
    # give every PDC observation zero weight
    obs.loc[pdc['name'].values, 'weight'] = 0

## plot residuals only for nonzero weight for the base realization which is read in automatically when `pyemu` reads in the pst control file

In [None]:
# The residual table and the observation data must list the same observations in the
# same order, because weights are copied from one to the other by label right after this.
# Index.equals also copes with a length mismatch and always yields a single bool,
# unlike an element-wise comparison fed through np.unique.
assert pst.res.index.equals(obs.index), 'residual file and control file observations do not match'

In [None]:
# Work on a copy of the residual table whose weights come from the observation data
# (so any zero-weighting applied to `obs` above is carried along).
tmpres = pst.res.assign(weight=obs.weight)


In [None]:
# Keep only observations with nonzero weight. The filter previously looked at
# pst.res.weight alone, which ignored the weights copied in from `obs` (including any
# PDC zero-weighting). An observation is now kept only if both sources give it weight.
has_weight = (tmpres.weight > 0) & (pst.res.weight.reindex(tmpres.index) > 0)
tmpres = tmpres.loc[has_weight]
print(len(tmpres))
tmpres.head()

In [None]:
# One two-panel figure per observation group: measured-vs-modeled and residual-vs-measured.
for group_name, group_res in tmpres.groupby('group'):
    fig, (ax_fit, ax_res) = plt.subplots(1, 2, figsize=(8, 4))

    # Left panel. The reference line is drawn in DATA coordinates so that it really is
    # the 1:1 line; a corner-to-corner line in axes coordinates is only 1:1 when the
    # x and y limits happen to be identical.
    lo = min(group_res.measured.min(), group_res.modelled.min())
    hi = max(group_res.measured.max(), group_res.modelled.max())
    ax_fit.plot([lo, hi], [lo, hi], '--')
    ax_fit.scatter(group_res.measured, group_res.modelled, s=10)
    ax_fit.set_xlabel('measured')
    ax_fit.set_ylabel('modeled')
    ax_fit.set_title('1to1')

    # Right panel: residuals with a zero line and the group's mean residual in red.
    ax_res.scatter(group_res.measured, group_res.residual, s=10)
    x_extent = ax_res.get_xlim()
    mean_residual = group_res.residual.mean()
    ax_res.plot(x_extent, [0, 0], '--')
    ax_res.plot(x_extent, [mean_residual, mean_residual], 'r-', lw=2)
    ax_res.set_title('residuals')
    ax_res.set_xlabel('measured')
    ax_res.set_ylabel('residual')

    fig.suptitle(group_name)

# Rejection Sampling. We can look at the PHI histogram for the best iteration and assign a cutoff of `phi_too_high` which delineates where rejection takes place

In [None]:
# Row of phi for the chosen iteration, skipping the first five columns by position.
phivec = phi.loc[best_iter].iloc[5:].copy()

In [None]:
# Histogram of phi for the chosen iteration, used to eyeball a rejection cutoff.
phivec.hist(bins=50)

In [None]:
# Rejection cutoff: only realizations with phi strictly below this value are kept further down.
phi_too_high= 3200 # was previously 5500 on 11/2020

## The next couple of cells seem extra involved, but they are meant to make fancy figures for the journal article showing PHI evolution over iterations and the rejection sampling all together.

In [None]:
def add_subplot_axes(ax, rect, axisbg='w'):
    """Create an inset axes positioned relative to ``ax`` and return it.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Parent axes; the inset is added to this axes' own figure.
    rect : sequence of four floats
        ``[x, y, width, height]``. ``x`` and ``y`` locate the inset's lower-left corner
        in the parent's axes coordinates; ``width`` and ``height`` are fractions of the
        parent's width and height.
    axisbg : color, default 'w'
        Face (background) color applied to the inset axes.

    Returns
    -------
    matplotlib.axes.Axes
        The new inset axes. Its x/y tick-label sizes are scaled by the square root of
        ``rect[2]`` / ``rect[3]`` respectively.
    """
    # Use the parent's figure rather than whichever figure is "current", so the inset
    # always lands next to its parent.
    fig = ax.figure
    parent_box = ax.get_position()

    # Lower-left corner: parent axes coordinates -> display -> figure coordinates.
    corner_display = ax.transAxes.transform(rect[0:2])
    x, y = fig.transFigure.inverted().transform(corner_display)

    width = parent_box.width * rect[2]
    height = parent_box.height * rect[3]
    # `axisbg` used to be accepted but never applied; pass it on as the face color.
    subax = fig.add_axes([x, y, width, height], facecolor=axisbg)

    # Shrink tick labels in proportion to the inset's relative size.
    x_labelsize = subax.get_xticklabels()[0].get_size() * rect[2] ** 0.5
    y_labelsize = subax.get_yticklabels()[0].get_size() * rect[3] ** 0.5
    subax.xaxis.set_tick_params(labelsize=x_labelsize)
    subax.yaxis.set_tick_params(labelsize=y_labelsize)
    return subax

In [None]:
# Panel A: log phi per iteration (base in blue, realizations in grey).
# Panel B (inset): phi histogram for the chosen iteration with the rejection cutoff.
# Strings containing LaTeX are raw strings so that "\P" is not parsed as an (invalid)
# Python escape sequence; the rendered text is unchanged.
# NOTE(review): `rf` is not imported in this notebook's import cell; confirm its source module.
fig = plt.figure(figsize=(5,4))
ax = phi['base'].apply(np.log10).plot(legend=False, lw=1.5, color='b', label='base')
# a single realization column is drawn first so the legend call below has two entries
phi.iloc[:,6:7].apply(np.log10).plot(legend=False,lw=0.5,color='k',alpha=0.15,label='realizations', ax = ax)
plt.legend(['base','realizations'])
phi.iloc[:,6:].apply(np.log10).plot(legend=False,lw=0.5,alpha=0.15,color='k', ax = ax)
# redraw base last so it sits on top of the grey lines
phi['base'].apply(np.log10).plot(legend=False, lw=1.5, color='b', ax=ax)
plt.ylabel(r'log $\Phi$')

ax1 = add_subplot_axes(ax, [0.5,.32,.4,0.4])
ax1.axvline(phi_too_high, color='k', label=r'cutoff $\Phi$')
ax1.legend()
phivec = phi.loc[best_iter][5:].copy()
phivec.hist(bins=50, ax=ax1)
rf.title(ax, r'Log $\Phi$ over iES iterations', capitalize=False, subplot_prefix='A')
# the iteration number in the title follows `best_iter` instead of being hard-coded
rf.title(ax1, r'Iteration {} $\Phi$ histogram with rejection cutoff'.format(best_iter), wrap=39, capitalize=False, subplot_prefix='B')

#plt.savefig('phi_and_rejection.pdf')

In [None]:
# Side-by-side histograms of phi for the chosen iteration: all realizations (A) and
# only those below the cutoff (B). `phivec` is left holding the trimmed set.
phivec = phi.loc[best_iter].iloc[5:].copy()
fig, (ax_full, ax_trimmed) = plt.subplots(1, 2, figsize=(rf.doublecolumn_size))

# panel A: everything, with the cutoff marked
phivec.hist(bins=50, ax=ax_full)
ax_full.axvline(phi_too_high, color='k', label='cutoff PHI')
ax_full.legend()
ax_full.set_ylabel('Frequency')
print(len(phivec))

# rejection step: keep realizations strictly below the cutoff
phivec = phivec[phivec < phi_too_high]
print(len(phivec))

# panel B: survivors, on the same x-range as panel A for comparison
phivec.hist(bins=50, ax=ax_trimmed)
ax_trimmed.set_xlim(ax_full.get_xlim())
for panel in (ax_full, ax_trimmed):
    panel.set_xlabel('Realization PHI')
rf.title(ax_full, 'PHI distribution', capitalize=False, subplot_prefix='A')
rf.title(ax_trimmed, 'PHI distribution trimmed', capitalize=False, subplot_prefix='B')

handles, labels = ax_full.get_legend_handles_labels()
rf.legend(ax_full, handles, labels, bbox_to_anchor=(.8, 0.85))


#plt.savefig('../report_materials/Figure20_rejectionsampling.pdf')

### now we need to make a `reals_to_keep` vector that keeps track of the ensemble members that made it through rejection sampling

In [None]:
# Labels of the realizations still present in `phivec` (i.e. those that passed the phi cutoff).
reals_to_keep = phivec.index.values

### in early stages of the project, there were cases where some model results with reasonable PHI had poor mass balance, so we rejected them as well. Now that is not a problem, but the logic is shown here nonetheless.

In [None]:
# truncate the ensemble to only reals_to_keep (based on phi)
# Load the observation ensemble for the CHOSEN iteration. This used to read the
# iteration-0 file, which made `ens` identical to the `base_ens` it is compared against
# further down and applied the mass-balance screen to the wrong iteration.
ens = pd.read_csv(os.path.join(rundir, '{0}.{1}.obs.csv'.format(pstroot, best_iter)), index_col=0)
# realization labels in `reals_to_keep` come from CSV column headers (strings); make
# the row labels strings too, as is done for the parameter ensemble at the end
ens.index = ens.index.astype(str)

# truncate the ensemble to only reals_to_keep (based on phi)
ens = ens.loc[reals_to_keep].copy()
# set percent_discrepancy to absolute value because we don't care about the sign
ens['perc_disc'] = ens['perc_disc'].abs()

In [None]:
# further truncate to eliminate bad mass balance runs 
# Second screen: drop realizations whose absolute percent discrepancy is not below 0.01.
print(len(ens))
mass_balance_ok = ens['perc_disc'].lt(0.01)
ens = ens.loc[mass_balance_ok]
ens

In [None]:
# Distribution of percent discrepancy among the realizations that remain.
ens.perc_disc.hist(bins=50)

In [None]:
# get rid of zero-weighted values dropped in the PDC
# Restrict the ensemble columns to the observations retained in `tmpres`.
ens = ens.loc[:, tmpres.index]

In [None]:
# reset keepreals based now both on phi and mass balance
# Refresh the keep-list from the rows left in `ens`, so it reflects both the phi and mass-balance screens.
reals_to_keep = ens.index.values

In [None]:
# Trim the phi vector to the same set of retained realizations.
phivec = phivec.loc[reals_to_keep]

In [None]:
# Phi histogram of the realizations that survived every screen.
phivec.hist(bins=50)

In [None]:
# Iteration-0 observation ensemble, cut down to the retained realizations (rows) and
# the observations in `tmpres` (columns), used as the comparison ensemble in the plot.
iter0_obs_csv = os.path.join(rundir, '{}.0.obs.csv'.format(pstroot))
base_ens = pd.read_csv(iter0_obs_csv, index_col=0).loc[reals_to_keep, tmpres.index]
pyemu.plot_utils.ensemble_res_1to1(ens, pst, base_ensemble=base_ens)

In [None]:
# Read the PDC table, lower-case the observation names and index the table by them.
pdc = pd.read_csv(os.path.join(rundir, '{}.pdc.csv'.format(pstroot)))
pdc['name'] = [nm.lower() for nm in pdc['name']]
pdc = pdc.set_index('name', drop=True)
pdc

## plot without PDC 

In [None]:
# NOTE(review): this rebinds `drop_pdc` from the boolean flag set in the configuration cell
# to a list of observation names; re-running the earlier `if drop_pdc is True:` cell after
# this one will silently skip its body. Consider a separate name for the list.
# Observation names starting with 'q_' are excluded from the drop list, to make sure
# streamflow is still plotted even if in PDC (should not be the case).
drop_pdc = [i for i in pdc.index if not i.startswith('q_')]

In [None]:
# Remove the PDC observations (columns) from both ensembles; `base_ens` is filtered
# using the columns that remain in `ens`.
ens = ens.loc[:, ~ens.columns.isin(drop_pdc)]
base_ens = base_ens.loc[:, ens.columns[~ens.columns.isin(drop_pdc)]]

### we can save out this observation ensemble

In [None]:
# Write the trimmed observation ensemble; create the target folder first so the save
# does not fail at the very end of the workflow on a fresh checkout.
obs_ens_csv = '../notebooks_report/final_obs_ensembles/ies_post_lt_noise.obs.csv'
os.makedirs(os.path.dirname(obs_ens_csv), exist_ok=True)
ens.to_csv(obs_ens_csv)

In [None]:
# Keep only the residual rows for observations that remain in `ens` and are not in the drop list.
retained_obs = ens.columns[~ens.columns.isin(drop_pdc)]
tmpres = tmpres.loc[retained_obs]

In [None]:
# Observation-plus-noise ensemble, limited to the retained realizations.
noise_csv = os.path.join(rundir, '{}.obs+noise.csv'.format(pstroot))
obs_ens = pd.read_csv(noise_csv, index_col=0).loc[reals_to_keep]
obs_ens


## In this case, with sampling of observation noise, looking for overlap of distributions

In [None]:
# One PDF page per observation: histogram of the noisy observations against the
# histogram of modelled values, with the observation mean and +/- one standard deviation.
hist_pdf = os.path.join(rundir, 'postproc', '{}_trimmed_pdc_obs_hist_plots.pdf'.format(pstroot))
os.makedirs(os.path.dirname(hist_pdf), exist_ok=True)  # make sure the output folder exists
with PdfPages(hist_pdf) as outpdf:
    for cob in tmpres.index.values:
        plt.figure()

        # statistics are computed once per observation and reused for the three lines
        obs_mean = obs_ens[cob].mean()
        obs_std = obs_ens[cob].std()

        obs_ens[cob].hist(bins=50, color='orange', edgecolor='none', alpha=.7, label='observed')
        ens[cob].hist(bins=50, edgecolor='none', label='modelled')
        # `color='k'` draws the lines in black; the former `mfc='k'` only sets a marker
        # face color and has no visible effect on a line without markers.
        plt.axvline(obs_mean, color='k', alpha=.5, label='obs mean')
        # raw strings keep "\s" from being parsed as an (invalid) Python escape sequence
        plt.axvline(obs_mean + obs_std, color='k', ls=':', alpha=.5, label=r'obs + 1$\sigma$')
        plt.axvline(obs_mean - obs_std, color='k', ls=':', alpha=.5, label=r'obs - 1$\sigma$')

        plt.title(cob)
        plt.legend()
        outpdf.savefig()
        plt.close('all')
        

# Finally, after all the visualization, we save out the parameter ensemble to supply to MODPATH for the final source water area delineation

In [None]:
# Parameter ensemble of the chosen iteration, cut down to the retained realizations
# and written out for the MODPATH step.
par_csv = os.path.join(rundir, '{0}.{1}.par.csv'.format(pstroot, best_iter))
par_ens = pd.read_csv(par_csv, index_col=0)
# row labels are forced to strings before selecting by `reals_to_keep`
par_ens.index = list(map(str, par_ens.index))
par_ens = par_ens.loc[reals_to_keep]
par_ens.to_csv(os.path.join(rundir, 'modpath_par_ens.csv'))