# This is an experimental notebook with charts currently under development

In [None]:
from AFEP_parse import *
from alchemlyb.preprocessing import subsampling
import os
from scipy.signal import correlate
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy.optimize import curve_fit, leastsq

# User parameters
## IMPORTANT: Make sure the temperature matches the temperature used to run the simulations.

In [None]:
# Directory holding the .fepout files. It is concatenated directly with `filename` when globbing, so keep the trailing separator.
#path='/home/ezry/winHome/Documents/ELIC_Data/PCPGPE211/Se'
path='/home/ems363/Documents/ELIC_DCDs_Analyses/PCPGPE211/POEG_104_withZ/'
# Glob pattern, appended to `path`, that selects the fepout files to analyze
filename='PO*.fepout'

# Simulation temperature. It is multiplied by a gas constant in kcal/(mol*K) to obtain RT, so presumably Kelvin - confirm against the simulation setup.
temperature = 303.15
decorrelate = False #Flag for automatic decorrelation (passed to readAndProcess)
detectEQ = False #Flag for automatic equilibrium detection (passed to readAndProcess)

In [None]:
# Thermal energy used to convert the estimators' output to kcal/mol
RT = 0.00198720650096 * temperature

# Gather every file matching the user-supplied pattern, in natural sort order
fepoutFiles = natsorted(glob(path+filename))

maxSize = 10**9 #Don't use the alchemlyb parser if larger than this size. (bytes)
# Combined on-disk size of all matched files, in bytes
totalSize = sum(os.path.getsize(fepout) for fepout in fepoutFiles)
print(f"Will process {len(fepoutFiles)} fepout files with total size:{np.round(totalSize/10**9, 2)}GB")

# Only warn: the read still proceeds, the user decides whether to switch workflows
if totalSize>maxSize:
    print(f"Warning: The files you are trying to read are quite large. Total size={totalSize}.\nTry the read, decorrelate, save method in the Expanded version of this notebook or increase the maxSize variable above.\nIn the future, consider using less frequent sampling (e.g. every 100 steps).")

# Read and process files
See Shirts and Chodera (2008) for more details

"Statistically optimal analysis of samples from multiple equilibrium states" doi: 10.1063/1.2978177

In [None]:
# Parse the fepout files (readAndProcess is not defined in this notebook; presumably it comes from the AFEP_parse star import).
# u_nk stores the fep data and is later grouped by its 'fep-lambda' level; affix is a string used to build meaningful output file names.
u_nk, affix = readAndProcess(fepoutFiles, temperature, decorrelate, detectEQ) #u_nk stores the fep data, affix is a string for meaningful file naming

In [None]:
# Run the free-energy estimators on u_nk (doEstimation is not defined in this notebook; presumably it comes from the AFEP_parse star import).
# The cells below read cumulative.BAR.f / cumulative.BAR.errors and perWindow.BAR.df / .ddf and perWindow.EXP.dG_f / .dG_b from the results.
perWindow, cumulative = doEstimation(u_nk)

In [None]:
# Total free-energy change: last row of the cumulative BAR estimate.
# The estimate AND its uncertainty are both scaled by RT so that the two numbers
# share the kcal/mol unit printed after them (previously only the estimate was scaled).
dG_total = cumulative.BAR.f.iloc[-1]*RT
dG_error = cumulative.BAR.errors.iloc[-1]*RT
changeAndError = f'\u0394G = {np.round(dG_total, 1)}\u00B1{np.round(dG_error, 3)} kcal/mol'
print(changeAndError)

# Plot the change in free energy based on BAR estimates

In [None]:
# Two stacked panels sharing the lambda axis: cumulative dG on top, per-window dG below
fig, (cumAx, eachAx) = plt.subplots(2,1, sharex=True)

# Cumulative change in kcal/mol (values and error bars are both scaled by RT)
cumAx.errorbar(cumulative.index, cumulative.BAR.f*RT, yerr=cumulative.BAR.errors*RT, marker=None, linewidth=1)
cumAx.set(ylabel=r'Cumulative $\rm\Delta G_{\lambda}$'+'\n(kcal/mol)')

# Per-window change in kcal/mol: BAR with error bars, plus the forward and (sign-flipped) backward exponential estimates
eachAx.errorbar(perWindow.index, perWindow.BAR.df*RT, yerr=perWindow.BAR.ddf*RT, marker=None, linewidth=1)
eachAx.plot(perWindow.index, perWindow.EXP.dG_f*RT, marker=None, linewidth=1, alpha=0.5)
eachAx.plot(perWindow.index, -perWindow.EXP.dG_b*RT, marker=None, linewidth=1, alpha=0.5)
# Axes objects expose set_xlabel/set_ylabel; the pyplot-style xlabel()/ylabel() do not exist on Axes
eachAx.set_xlabel(r'$\lambda$')
eachAx.set_ylabel(r'$\rm\Delta G_{\lambda}$'+'\n(kcal/mol)')

fig.set_figwidth(4)
fig.set_figheight(8)
fig.tight_layout()
plt.savefig(f'{path}dG_{affix}.pdf')
plt.savefig(f'{path}dG_{affix}.png', dpi=600)
plt.show()

# Plot the estimated total change in free energy as a function of simulation time, using contiguous subsets that grow forward from t=0 ("Forward") and backward from t=end ("Backward")

In [None]:
def convergencePlot(theax, fs, ferr, bs, berr, fwdColor='#0072B2', bwdColor='#D55E00', lgndF=None, lgndB=None):
    """Draw forward and backward convergence curves on an existing axes.

    Parameters
    ----------
    theax : axes-like object providing errorbar, plot, axhline and xaxis.set_ticks
    fs, ferr : forward estimates and their errors (indexable sequences of equal length)
    bs, berr : backward estimates and their errors (indexable sequences of equal length)
    fwdColor, bwdColor : colors of the forward / backward curves
    lgndF, lgndB : colors of the legend entries; each one independently falls back
        to the matching curve color when it is not given

    Returns
    -------
    theax, after drawing.

    The k-th of n points is placed at x = k/n, i.e. the fraction of the data it was
    computed from. For n = 10 this matches the previously hard-coded 0.1 ... 1.0.
    """
    # Resolve the two legend colors separately so a caller-supplied lgndB is never discarded
    if not lgndF:
        lgndF = fwdColor
    if not lgndB:
        lgndB = bwdColor

    # Each series gets x positions derived from its own length
    fwdX = np.arange(1, len(fs)+1)/len(fs)
    bwdX = np.arange(1, len(bs)+1)/len(bs)

    theax.errorbar(fwdX, fs, yerr=ferr, marker='o', linewidth=1, color=fwdColor, markerfacecolor='white', markeredgewidth=1, markeredgecolor=fwdColor, ms=5)
    theax.errorbar(bwdX, bs, yerr=berr, marker='o', linewidth=1, color=bwdColor, markerfacecolor='white', markeredgewidth=1, markeredgecolor=bwdColor, ms=5, linestyle='--')

    theax.xaxis.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1])

    # Reference line at the estimate obtained from the full forward data set
    finalMean = fs[-1]
    theax.axhline(y= finalMean, linestyle='-.', color='gray')
    # Single-point plots that exist only to create the legend entries
    theax.plot(0, finalMean, linewidth=1, color=lgndF, label='Forward Time Sampling')
    theax.plot(0, finalMean, linewidth=1, color=lgndB, linestyle='--', label='Backward Time Sampling')

    return theax

def doConvPlot(ax, X, fs, ferr, fwdColor, label=None):
    """Draw one marker-less error-bar series in a single color and return the axes.

    X and fs are the x/y values, ferr the error bars (forwarded as yerr),
    fwdColor the line and marker-edge color, label the optional legend label.
    """
    seriesStyle = dict(
        marker=None,
        linewidth=1,
        color=fwdColor,
        markerfacecolor='white',
        markeredgewidth=1,
        markeredgecolor=fwdColor,
        ms=5,
    )
    ax.errorbar(X, fs, yerr=ferr, label=label, **seriesStyle)
    return ax

def moving_average(x, w):
    """Return the w-point running mean of x, with the same length as x.

    The window sum comes from np.convolve in 'same' mode, so positions near the
    edges are summed over fewer real samples but still divided by w.
    """
    window = np.ones(w)
    windowSums = np.convolve(x, window, mode='same')
    return windowSums / w

def subSample(unkGrps, lowPct, hiPct):
    """Keep, for every group, the rows whose first index level lies between two percentiles.

    Parameters
    ----------
    unkGrps : iterable of (key, DataFrame) pairs, e.g. the result of DataFrame.groupby
    lowPct, hiPct : lower and upper percentile (0-100) of the first index level,
        evaluated separately for each group; both bounds are inclusive

    Returns
    -------
    DataFrame with the selected rows of all groups concatenated in iteration order.

    Raises
    ------
    RuntimeError
        If a group has no rows inside the requested range. The message names the
        group and the bounds (the bare `raise` used before produced a RuntimeError
        without any of this information).
    """
    partial = []
    for key, group in unkGrps:
        # Values of the first index level (presumably the sample time/step - confirm against the parser)
        idcs = group.index.get_level_values(0)

        # 'closest_observation' makes the bounds actual observed index values rather than interpolated ones
        lowBnd = np.percentile(idcs, lowPct, method='closest_observation')
        hiBnd = np.percentile(idcs, hiPct, method='closest_observation')
        mask = np.logical_and(idcs<=hiBnd, idcs>=lowBnd)
        sample = group.loc[mask]
        if len(sample)==0:
            raise RuntimeError(
                f"ERROR: no samples in window {key}\n"
                f"Upper bound: {hiBnd}\nLower bound: {lowBnd}"
            )

        partial.append(sample)

    return pd.concat(partial)
    
    
def doConvergence(u_nk, tau=1, num_points=10):
    """Estimate the total free-energy change from growing forward and backward subsets.

    For i = 1..num_points, the forward subset spans percentiles 0 to 100*i/num_points
    and the backward subset spans 100*(1-i/num_points) to 100 of each lambda window
    (see subSample). BAR is fitted to each subset and the last cumulative value and
    its error are recorded.

    Parameters
    ----------
    u_nk : DataFrame that can be grouped by 'fep-lambda'
    tau : unused; kept so existing callers keep working
    num_points : number of subset sizes to evaluate (default 10, the value that
        used to be hard-coded)

    Returns
    -------
    forward, forward_error, backward, backward_error : lists of length num_points,
        in the units returned by get_BAR.
    """
    groups = u_nk.groupby('fep-lambda')

    forward = []
    forward_error = []
    backward = []
    backward_error = []
    for i in range(1, num_points+1):
        # forward: first i/num_points of every window
        partial = subSample(groups, 0, 100*i/num_points)
        estimate = BAR().fit(partial)
        _, _, f, _, _, errors = get_BAR(estimate)

        forward.append(f.iloc[-1])
        forward_error.append(errors[-1])

        # backward: last i/num_points of every window
        partial = subSample(groups, 100*(1-i/num_points), 100)
        estimate = BAR().fit(partial)
        _, _, f, _, _, errors = get_BAR(estimate)

        backward.append(f.iloc[-1])
        backward_error.append(errors[-1])

    return forward, forward_error, backward, backward_error

    
def altConvergence(u_nk, nbins):
    """Estimate the total free-energy change separately in nbins consecutive blocks.

    Block b (0-based) covers percentiles 100*b/nbins to 100*(b+1)/nbins of each
    lambda window (see subSample). BAR is fitted to every block on its own.

    Returns two numpy arrays of length nbins: the last cumulative value of each
    block's estimate and the matching error, in the units returned by get_BAR.
    """
    byLambda = u_nk.groupby('fep-lambda')

    blockEstimates = []
    blockErrors = []
    for block in range(nbins):
        lowerPct = 100*block/nbins
        upperPct = 100*(block+1)/nbins
        blockData = subSample(byLambda, lowerPct, upperPct)
        fitted = BAR().fit(blockData)
        barResults = get_BAR(fitted)
        # get_BAR returns (l, l_mid, f, df, ddf, errors); only f and errors are needed here
        cumulativeF = barResults[2]
        cumulativeErr = barResults[5]

        blockEstimates.append(cumulativeF.iloc[-1])
        blockErrors.append(cumulativeErr[-1])

    return np.array(blockEstimates), np.array(blockErrors)

## Two alternative convergence plots based on block-average-like subsampling

In [None]:
# Number of consecutive blocks each lambda window is split into
nbins=100
# `trimmed` is never assigned in this notebook; use the parsed data frame, as the other altConvergence calls do
fs, Err = altConvergence(u_nk, nbins)

In [None]:
fig, ax = plt.subplots(1,1)
# Place every block estimate at the center of its block on a 0-1 time axis
lspace = np.linspace(0,1,nbins+1)
X = (lspace[1:]+lspace[:-1])/2
#doConvPlot(ax, X, np.multiply(fs,RT), np.multiply(Err,RT), "#310023") 
doConvPlot(ax, X, np.multiply(fs,RT), None, "#310023", label="blocked averages") 

#ax.plot(X, moving_average(fs, 4)*RT, label='moving average')

# Running mean of the block estimates, in kcal/mol
cumMean = np.cumsum(fs)/np.arange(1,len(fs)+1)*RT
ax.plot(X, cumMean, label='cumulative mean')

# Running standard deviation over the same blocks that enter cumMean at each position
# (slices are fs[:1], fs[:2], ...; starting from fs[:0] gave NaN and lagged one block behind)
cumStd = np.array([RT*np.std(fs[:i]) for i in range(1, len(fs)+1)])
ax.fill_between(X, cumMean+cumStd, cumMean-cumStd, alpha=0.3, label='std of cumulative mean')

ax.legend()

ax.xaxis.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1])

# Reference line: total change from the full-data cumulative BAR estimate
# (the bare name `f` is only a local inside the convergence helpers, not a notebook variable)
finalMean = cumulative.BAR.f.iloc[-1]*RT
ax.axhline(y= finalMean, linestyle='-.', color='gray')
#ax.plot(0, finalMean, linewidth=1, color=fwdColor)
ax.set(xlim=(0,1))
#ax.legend()
ax.set(ylabel=r'$\rm\Delta G_{\lambda}$'+'\n(kcal/mol)', xlabel='Fraction of simulation time')
#plt.savefig(f'{path}convergence_BlocksWAve_n{nbins}{affix}.pdf')
#plt.savefig(f'{path}convergence_BlocksWAve_n{nbins}{affix}.png', dpi=600)
plt.show()

In [None]:
fwdColor='#0072B2'
bwdColor='#D55E00'
fig, ax = plt.subplots(1,1)

# Same block analysis at three block counts, one color per scale, drawn on one axes.
# The loop variables are deliberately the notebook-level names so that nbins, fs, Err,
# lspace and X end up holding the values of the last scale, as before.
for nbins, blockColor in ((10, fwdColor), (5, bwdColor), (3, "#310023")):
    fs, Err = altConvergence(u_nk, nbins)
    # Block centers on a 0-1 time axis
    lspace = np.linspace(0,1,nbins+1)
    X = (lspace[1:]+lspace[:-1])/2
    doConvPlot(ax, X, fs*RT, Err*RT, blockColor)

ax.xaxis.set_ticks([0, 0.2, 0.4, 0.6, 0.8, 1])

# Reference line: total change from the full-data cumulative BAR estimate
# (the bare name `f` is only a local inside the convergence helpers, not a notebook variable)
finalMean = cumulative.BAR.f.iloc[-1]
ax.axhline(y= finalMean*RT, linestyle='-.', color='gray')
#ax.plot(0, finalMean, linewidth=1, color=fwdColor)
ax.set(xlim=(0,1))
#ax.legend()
ax.set(ylabel=r'$\rm\Delta G_{\lambda}$'+'\n(kcal/mol)', xlabel='Fraction of simulation time')
plt.savefig(f"{path}_multiScaleBlockAves_{affix}.pdf")
plt.savefig(f"{path}_multiScaleBlockAves_{affix}.png", dpi=600)
plt.show()

## Traditional convergence plots

In [None]:
# Forward/backward estimates from growing subsets of the data
forward, fwdErr, backward, bwdErr = doConvergence(u_nk)

# Convert all four series to kcal/mol before plotting
fwdKcal = np.array(forward)*RT
fwdErrKcal = np.array(fwdErr)*RT
bwdKcal = np.array(backward)*RT
bwdErrKcal = np.array(bwdErr)*RT

fig, ax = plt.subplots(1,1)
convergencePlot(ax, fwdKcal, fwdErrKcal, bwdKcal, bwdErrKcal)
ax.legend()
ax.set_ylabel(r'$\rm\Delta G_{\lambda}$'+'\n(kcal/mol)')
ax.set_xlabel('Fraction of simulation time')

# Write both a vector and a high-resolution raster copy
fig.savefig(f'{path}_ES_Convergence_{affix}.pdf')
fig.savefig(f'{path}_ES_Convergence_{affix}.png', dpi=600)

In [None]:
# Draw a convergence plot with the convergence_plot helper (not defined in this notebook; presumably it comes from the AFEP_parse star import) and save it as a PDF.
# NOTE(review): `l` is not assigned in any cell of this notebook (it only exists as a local inside doConvergence/altConvergence),
# so unless the star import provides it this call raises NameError - confirm what the second argument should be.
convergence_plot(u_nk, l)
plt.savefig(f"{path}_currentConvergence_{affix}.pdf")


# Use an exponential estimator to assess residual discrepancies and check for hysteresis

The PDF is estimated with SciPy's Gaussian kernel density estimator (`scipy.stats.gaussian_kde`), which determines the bandwidth automatically.

In [None]:
# Left panel: per-window discrepancy vs lambda. Right panel: its estimated density, sharing the y axis.
fig, (histAx, pdfAx) = plt.subplots(1, 2, sharey=True, gridspec_kw={'width_ratios': [3, 1]})

# The bare names dG_f / dG_b are not defined at notebook level; the per-window exponential
# estimates live in perWindow.EXP (the same columns plotted in the dG figure).
# NOTE(review): passed as plain arrays - confirm the argument types getPDF expects.
X, Y, pdfX, pdfY, fitted, pdfXnorm, pdfYnorm, pdfYexpected = getPDF(perWindow.EXP.dG_f.to_numpy(), perWindow.EXP.dG_b.to_numpy())

# Discrepancy column converted to kcal/mol. The column name matches the one used by the
# following cell ('difference'); x values are the per-window index used by the other per-window plots.
diff = RT*perWindow.EXP['difference']
histAx.plot(perWindow.index, diff, linewidth=2)
histAx.set(xlabel=r'$\lambda$', ylabel=r'$\delta_\lambda$ (kcal/mol)')
# The y axis is shared, so this limit applies to both panels
histAx.set_ylim(-1,1)


# Histogram of the discrepancies; pdfX/pdfY are reused by the following cell
pdfY, edges = np.histogram(diff, bins=20, density=True)
pdfX = (edges[1:] + edges[:-1])/2
#pdfAx.plot(pdfY, pdfX, label="histogrammed")


# Gaussian KDE evaluated on the visible range, drawn sideways so it lines up with the left panel
kernel = sp.stats.gaussian_kde(diff)
samples = np.linspace(-1, 1, 1000)
pdfAx.plot(kernel(samples), samples, label='KDE')

pdfAx.set(xlabel="KDE")
# NOTE(review): mean/std are taken from getPDF's X while the mode comes from `diff` (kcal/mol) - confirm both are in the same units
std = np.std(X)
mean = np.average(X)
mode = pd.Series(kernel(samples), index=samples).idxmax()
textstr = r"$\rm{mode_{KDE}=}$"+f"{np.round(mode,2)}"+"\n"+fr"$\mu$={np.round(mean,2)}"+"\n"+fr"$\sigma$={np.round(std,2)}"
props = dict(boxstyle='square', facecolor='white', alpha=0.5)
# Summary box in the left panel (the axes is named histAx; `hystAx` was never defined)
histAx.text(0.35, 0.95, textstr, transform=histAx.transAxes, fontsize=14,
        verticalalignment='top', bbox=props)

fig.tight_layout()

plt.savefig(f"{path}discrepancies_{affix}.pdf")
plt.savefig(f"{path}discrepancies_{affix}.png", dpi=600)
plt.show()

In [None]:
from scipy.stats import skew  # needed for the title below; imported here so the cell does not depend on the star import providing it

# Per-window forward/backward discrepancy drawn as vertical bars from zero (values used as stored, labelled kT)
plt.vlines(perWindow.index, np.zeros(len(perWindow)), perWindow.EXP['difference'], label="fwd - bwd", linewidth=2)

plt.legend()
plt.title(f'Fwd-bwd discrepancies by lambda {affix}')
plt.xlabel('Lambda')
plt.ylabel('Diff. in delta-G (kT)')
# Distinct file name: discrepancies_{affix}.png is already written by the KDE figure and would be overwritten
plt.savefig(f'{path}discrepancies_byLambda_{affix}.png', dpi=600)



#plot the data
# pdfX, pdfY and X are the notebook-level values left by the discrepancy/KDE cell
fig, pdfAx = plt.subplots(1, 1)
pdfAx.set_xlabel('Difference in delta-G')

pdfAx.plot(pdfX, pdfY,  label="Estimated Distribution")
pdfAx.set_ylabel("PDF")

fig.set_figheight(5)
pdfAx.title.set_text(f"Estimated PDF (fwd-bkwd)\nSkewness: {np.round(skew(X),2)}\nPopulation parameters: Mean={np.round(np.average(X),3)}, Stdv={np.round(np.std(X),3)}")
plt.savefig(f"{path}pdf_{affix}.png", dpi=600)


plt.show()