# Import necessary libraries
AFEP_parse.py contains all the functions and library calls necessary to run the notebook
# Required libraries:
- numpy
- pandas
- matplotlib
- alchemlyb (pip install git+https://github.com/alchemistry/alchemlyb)
- natsort (for sorting file names)
- glob (for unix-like file paths)



In [None]:
# Star import: no other cell in this notebook imports np, pd, plt, glob, tqdm,
# namd, BAR, readFiles, u_nk_fromDF, get_MBAR, get_EXP or convergence_plot, so
# they are presumably all provided by AFEP_parse -- confirm against that module.
from AFEP_parse import *
# Default resolution for figures created below (savefig calls pass their own dpi).
plt.rcParams['figure.dpi'] = 150

In [None]:
import os

In [None]:
# Directory holding the fepout files, and the glob pattern that selects them.
path = '../../path/to/fepouts'
filename = '*.fep*'
# `path` is concatenated directly with file names, both for the glob below and
# for every output file written later, so make sure it ends with a separator.
# os.path.join(p, '') appends one only when it is missing.
path = os.path.join(path, '')
fepoutFiles = glob(path + filename)

# Simulation temperature (K) and the matching thermal energy in kcal/mol.
temperature = 300
RT = 0.00198720650096 * temperature # ca. 0.59kcal/mol

# Total on-disk size in bytes; used later to decide which reading strategy fits.
totalSize = sum(os.path.getsize(file) for file in fepoutFiles)
print(f"Will process {len(fepoutFiles)} fepout files.\nTotal size:{np.round(totalSize/10**9, 2)}GB")

# For large data sets: read, decorrelate, save
This reduces RAM requirements between reading and decorrelating

Remember: pickles are not future-proof and should not be used for long-term data storage

In [None]:
from alchemlyb.preprocessing import subsampling

In [None]:
# Decorrelation method handed to alchemlyb; also used to tag output file names.
method = 'dE'
affix = f'decorrelated_{method}'

# Paths of the per-file pickles written below, in processing order.
pickles = []

# Handle one fepout file at a time (read -> u_nk -> decorrelate -> pickle) so
# that only a single file's data has to be held in memory at once.
for idx, file in enumerate(tqdm(fepoutFiles)):
    df = readFiles([file])
    # NOTE(review): the third positional argument (0 here) is presumably the
    # number of equilibration samples to discard -- confirm in u_nk_fromDF.
    u_nk = u_nk_fromDF(df, temperature, 0, warnings=False)
    u_nk = subsampling.decorrelate_u_nk(u_nk, method)

    # os.path.join places the pickle inside `path` whether or not `path`
    # carries a trailing separator (plain concatenation did not).
    pickle_path = os.path.join(path, f"{affix}{idx:03d}.pkl")
    u_nk.to_pickle(pickle_path)
    pickles.append(pickle_path)

In [None]:
# Load every decorrelated per-file frame back from disk and stack them into a
# single u_nk table for the estimator.
pickleDFs = [pd.read_pickle(pickle_file) for pickle_file in pickles]
u_nk = pd.concat(pickleDFs)

In [None]:
#u_nk = u_nk.drop(0.975, axis=1) #to remove incomplete windows e.g. when changing lambda resolutions

# Demonstration of equivalence between the above and below methods

In [None]:
## Demonstrate that AFEP readFiles+u_nk_fromDF is identical to namd.extract_u_nk
## readFiles is more space efficient and can handle single files. Only reads each file once. Less input validation than alchemlyb.namd.
#u_nk_target = namd.extract_u_nk(fepoutFiles[0:5], temperature)
#df = readFiles(fepoutFiles[0:5])
#u_nk_test = u_nk_fromDF(df, temperature, 10000)
#np.all(u_nk_target.fillna(100)==u_nk_test.fillna(100))

# Small data sets can be read and decorrelated sequentially, if desired
See Shirts and Chodera (2008) for more details

"Statistically optimal analysis of samples from multiple equilibrium states" doi: 10.1063/1.2978177

In [None]:
maxSize = 10**9 #Don't use the alchemlyb parser if larger than this size. (bytes)
if totalSize < maxSize:
    from alchemlyb.preprocessing import subsampling

    # Parse all files in one go with alchemlyb's NAMD reader.
    u_nk = namd.extract_u_nk(fepoutFiles, temperature)

    method = 'dE'
    affix = f'decorrelated_{method}'
    #affix = 'unprocessed'

    # Decorrelate each lambda window on its own, then join the pieces with a
    # single concat. DataFrame.append was removed in pandas 2.0 and copied the
    # accumulated frame on every iteration.
    decorrelated = [
        subsampling.decorrelate_u_nk(group, method)
        for _, group in u_nk.groupby('fep-lambda')
    ]
    # pd.concat rejects an empty list; fall back to an empty frame, which is
    # what the append-based loop produced when there were no groups.
    decorr = pd.concat(decorrelated) if decorrelated else pd.DataFrame([])

    u_nk = decorr
else:
    print(f"Warning: The files you are trying to read are quite large. Total size={totalSize}. Try reading and decorrelating (above) or change the maxSize parameter.")

# Carry out MBAR Fitting and Analyses

In [None]:
# Build the free-energy estimator and fit it to the reduced potentials in u_nk.
# NOTE(review): the surrounding headings and the get_MBAR helper name refer to
# MBAR, but the estimator constructed here is BAR -- confirm which is intended.
bar = BAR()
bar.fit(u_nk)

# Extract key features from the MBAR fitting and get ΔG
Note: alchemlyb operates in units of kT by default. We multiply by RT to convert to units of kcal/mol.

In [None]:
# Unpack the fitted estimator's results. Note that `df` is rebound here to the
# per-window free-energy changes (it previously held a parsed data frame).
l, l_mid, f, df, ddf, errors = get_MBAR(bar)

# Overall delta G: last entry of the cumulative per-window sum, in kcal/mol.
total_dG = (df.cumsum() * RT)[-1]
# `errors` is in kT (it is plotted unscaled against kT and scaled by RT for the
# kcal/mol plot), so it must be multiplied by RT to match the printed unit.
# Previously the error was printed in kT next to a kcal/mol label.
total_err = errors[-1] * RT
print(f'\u0394G = {np.round(total_dG, 3)}\u00B1{np.round(total_err, 3)} kcal/mol')

# Plot the change in free energy based on MBAR estimates

In [None]:
# Cumulative change in kT
plt.errorbar(l, f, yerr=errors, marker='.')
plt.xlabel('lambda')
plt.ylabel('DeltaG(lambda) (kT)')
plt.title(f'Cumulative dG with accumulated errors {affix}')
plt.savefig(f'{path}dG_cumulative_kT_{affix}.png', dpi=600)
plt.show()

# Cumulative change in kcal/mol
plt.errorbar(l, f * RT, yerr=errors*RT, marker='.')
plt.xlabel('lambda')
plt.ylabel('DeltaG(lambda)(kcal/mol)')
plt.savefig(f'{path}dG_cumulative_kcal_per_mol_{affix}.png', dpi=600)
plt.show()

# Per-window change in kT
plt.errorbar(l_mid, df, yerr=ddf, marker='.')
plt.xlabel('lambda')
plt.ylabel('Delta G per window (kT)')
plt.title(f'Per-Window dG with individual errors {affix}')
plt.savefig(f'{path}dG_{affix}.png', dpi=600)
plt.show()


# Plot the estimated total change in free energy as a function of simulation time; contiguous subsets starting at t=0 ("Forward") and t=end ("Reverse")

In [None]:
# Draw the convergence plot via the project helper, then title and save it.
convergence_title = f'Convergence {affix}'
convergence_png = f'{path}convergence_{affix}.png'

convergence_plot(u_nk, l)
plt.title(convergence_title)
plt.savefig(convergence_png, dpi=600)

# Use an exponential estimator to assess residual discrepancies and check for hysteresis

In [None]:
# Run the exponential estimator helper on u_nk. This rebinds l and l_mid
# (previously set from get_MBAR) and yields dG_f / dG_b -- presumably the
# per-window forward and backward estimates; confirm against get_EXP.
l, l_mid, dG_f, dG_b = get_EXP(u_nk)

In [None]:
# One vertical bar per window, from zero up to dG_f + dG_b.
# NOTE(review): the legend reads "fwd - bwd" while the values are summed;
# presumably dG_b carries the opposite sign -- confirm against get_EXP.
discrepancy = dG_f + np.array(dG_b)
baseline = np.zeros(len(l_mid))
plt.vlines(l_mid, baseline, discrepancy, label="fwd - bwd", linewidth=2)

plt.legend()
plt.xlabel('Lambda')
plt.ylabel('Diff. in delta-G')
plt.title(f'Fwd-bwd discrepancies by lambda {affix}')
plt.savefig(f'{path}discrepancies_{affix}.png', dpi=600)

# The above data should follow a roughly normal distribution centered on 0.

In [None]:
# Histogram of the same per-window dG_f + dG_b values used in the bar plot.
discrepancy = dG_f + np.array(dG_b)
plt.hist(discrepancy)
plt.xlabel('Difference in delta-G')
plt.ylabel('Count')
plt.title(f'Distribution of fwd-bwd discrepancies {affix}')
plt.savefig(f'{path}distribution_{affix}.png', dpi=600)