In [None]:
import glob
import sys
import os
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

sys.path.append("/user/brussel/104/vsc10478/scripts/gmxtools")
from gmxtools.ndhist import NDHistogram

# One-letter -> three-letter residue codes for the 20 amino acids listed here.
aminoacids1to3 = {
    "A": "ALA", "C": "CYS", "D": "ASP", "E": "GLU", "F": "PHE",
    "G": "GLY", "H": "HIS", "I": "ILE", "K": "LYS", "L": "LEU",
    "M": "MET", "N": "ASN", "P": "PRO", "Q": "GLN", "R": "ARG",
    "S": "SER", "T": "THR", "V": "VAL", "W": "TRP", "Y": "TYR",
}

# Reverse lookup: three-letter -> one-letter code.
aminoacids3to1 = dict(
    (three_letter, one_letter)
    for one_letter, three_letter in aminoacids1to3.items()
)

%matplotlib inline

# Correlation plot

In [None]:
# Peptide system to inspect; the name is also interpolated into the path below.
peptide = "ggSEPgGLNgg"

# Load the projection file of this peptide from the 7peptides set, declaring 4 replicas.
# NOTE(review): later cells index proj.data as [replica, frame, component] — confirm against NDHistogram.
proj = NDHistogram.from_projectionfile(f"../7peptides/{peptide}/analy/proj.xvg", n_replicas=4)

In [None]:
# Colormap: 256 samples of "jet" whose first (lowest) entry is replaced by
# fully transparent white, so values at the bottom of the colour scale are
# not painted.
mycolors = mpl.cm.jet(np.linspace(0, 1, 256))
mycolors[0, :] = [1., 1., 1., 0.]
mycmap = mpl.colors.ListedColormap(mycolors)

# Build bins: the same edges for every replica, spanning the global range of
# the first two projection components widened by delta/N_POINTS on each side.
N_POINTS = 50
vmin1, vmin2 = np.min(proj.data[:, :, :2], axis=(0, 1))
vmax1, vmax2 = np.max(proj.data[:, :, :2], axis=(0, 1))
delta1, delta2 = vmax1 - vmin1, vmax2 - vmin2
bins = (
    np.linspace(vmin1 - delta1/N_POINTS, vmax1 + delta1/N_POINTS, N_POINTS+2),
    np.linspace(vmin2 - delta2/N_POINTS, vmax2 + delta2/N_POINTS, N_POINTS+2)
)

# Build data: one density-normalised 2D histogram per replica
# (4 replicas, one per panel of the 2x2 grid below).
hist2d = []
for i in range(4):
    h, xe, ye = np.histogram2d(proj.data[i, :, 0], proj.data[i, :, 1], bins=bins, density=True)
    # histogram2d bins its first argument along axis 0; transpose so that
    # rows run along component 2 and columns along component 1 for imshow.
    hist2d.append(h.T)
hist2d = np.stack(hist2d)
# Disabled alternative: show -log(density) instead of the density itself.
#hist2d[hist2d == 0] = np.nan
#hist2d = -np.log(hist2d)
#hist2d[np.isnan(hist2d)] = np.nanmax(hist2d) + (np.nanmax(hist2d) - np.nanmin(hist2d)) / 255.
# One normalisation shared by all panels so that their colours are comparable.
norma = mpl.colors.Normalize(np.nanmin(hist2d), np.nanmax(hist2d))


fig, axs = plt.subplots(2, 2, figsize=(7, 5), dpi=96, gridspec_kw={"hspace": .3, "wspace": .3})

for i, ax in enumerate(axs.flatten()):
    ax.grid(True, "major", "both", color="0.5", linestyle="--", alpha=.6)
    # origin="lower" places row 0 (the lowest PC #2 bin) at the bottom; with the
    # default origin the map would be mirrored vertically relative to the axis.
    # The extent is taken from the outer bin edges, which is the area the
    # histogram actually covers (slightly wider than the raw data range).
    im = ax.imshow(hist2d[i], interpolation="gaussian", cmap=mycmap, norm=norma,
                   origin="lower", extent=(xe[0], xe[-1], ye[0], ye[-1]))
    ax.set_xlabel("PC #1")
    ax.set_xticks(np.arange(-2, 2.1, 1))
    ax.set_ylabel("PC #2")
    ax.set_yticks(np.arange(-2, 2.1, 1))

plt.colorbar(im, ax=axs)

#fig.savefig(f"figures/replica_pc1_pc2_{peptide}.png", dpi=300, bbox_inches="tight")

In [None]:
def gkern(l=5, sig=1.):
    """
    Return a normalised 2D Gaussian kernel.

    The kernel is an `l` x `l` array sampled on unit-spaced points centred
    on zero, with standard deviation `sig`, scaled so its entries sum to 1.
    """
    half_width = (l - 1) / 2.
    offsets = np.linspace(-half_width, half_width, l)
    # 1D Gaussian profile; the 2D kernel is its outer product with itself.
    profile = np.exp(-0.5 * np.square(offsets) / np.square(sig))
    kernel = profile[:, np.newaxis] * profile[np.newaxis, :]
    return kernel / kernel.sum()

# Evaluate a 7x7 kernel with sigma = 2; as the last expression of the cell its value is shown as the cell output.
gkern(7,2)

In [None]:
from scipy.signal import convolve2d

In [None]:
# Colormap: 256 samples of "jet" whose first (lowest) entry is replaced by
# fully transparent white.
mycolors = mpl.cm.jet(np.linspace(0, 1, 256))
mycolors[0, :] = [1., 1., 1., 0.]
mycmap = mpl.colors.ListedColormap(mycolors)

# Build bins: global range of the first two projection components over all
# replicas, widened by delta/N_POINTS on each side.
N_POINTS = 100
INDEX = 0  # first-axis index of proj.data (the replica) shown in this figure
vmin1, vmin2 = np.min(proj.data[:, :, :2], axis=(0, 1))
vmax1, vmax2 = np.max(proj.data[:, :, :2], axis=(0, 1))
delta1, delta2 = vmax1 - vmin1, vmax2 - vmin2
bins = (
    np.linspace(vmin1 - delta1/N_POINTS, vmax1 + delta1/N_POINTS, N_POINTS+2),
    np.linspace(vmin2 - delta2/N_POINTS, vmax2 + delta2/N_POINTS, N_POINTS+2)
)

# Build data: density-normalised histogram, indexed [component-1 bin, component-2 bin].
hist2d, xe, ye = np.histogram2d(proj.data[INDEX, :, 0], proj.data[INDEX, :, 1], bins=bins, density=True)

# Smooth with a 7x7 Gaussian kernel (sigma = 2 bins); mode="same" keeps the shape.
hist2d = convolve2d(hist2d, gkern(7, 2), mode="same")

# Bin centres as coordinate grids. indexing="ij" makes xvals[a, b] / yvals[a, b]
# refer to the same bin as hist2d[a, b]; with indexing="xy" the grids are
# transposed relative to the histogram and, because the array is square, the
# surface would be drawn with the two components swapped without any error.
xvals, yvals = np.meshgrid((xe[1:] + xe[:-1]) / 2, (ye[1:] + ye[:-1]) / 2, indexing="ij")

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, dpi=300)

surf = ax.plot_surface(xvals, yvals, hist2d, cmap=mycmap,
                       linewidth=0, antialiased=True)
ax.set_xlabel("PC #1")
ax.set_ylabel("PC #2")

In [None]:
# Keep the first 5 projection components and histogram them with 5 bins,
# using frames 0..14999.
proj.data = proj.data[:, :, :5]
proj.make_nbins(nbins=5)
proj.make_histograms(first=0, last=14999)

# Pair of replicas (0-based indices into proj.hists) to compare.
i, j = 0, 3
# NOTE(review): 15000 presumably is the number of frames selected above
# (first=0 .. last=14999), turning counts into fractions — confirm.
arri, arrj = proj.hists[[i, j]] / 15000
# Drop bins that are empty in both replicas.
msk = (arri > 0) | (arrj > 0)
arri, arrj = arri[msk], arrj[msk]

# Disabled: manual Pearson correlation (refers to names not defined in this cell).
#sp = np.sum( (arri - np.mean(arri)) * (arrj - np.mean(arrj)) )
#sqi, sqj = np.sum((ai - np.mean(ai))**2), np.sum((aj - np.mean(aj))**2)
#corr.append( sp / np.sqrt(sqi * sqj) )

fig, ax = plt.subplots(1, 1, figsize=(3, 3), dpi=150)
ax.set_aspect("equal")
ax.grid(True, "major", "both", linewidth=.3, alpha=.5, color="k")
ax.scatter(arri, arrj, s=5, alpha=.3, color="#00339F")
# Labels follow the selected pair (1-based) instead of being hard-coded, so
# they stay correct when i or j is changed.
ax.set_xlabel(f"Replica #{i + 1}")
ax.set_xticks(np.arange(0, 0.015, 5e-3))
ax.set_xticks(np.arange(0, 0.014, 1e-3), minor=True)
ax.set_ylabel(f"Replica #{j + 1}")
ax.set_yticks(np.arange(0, 0.015, 5e-3))
ax.set_yticks(np.arange(0, 0.014, 1e-3), minor=True)
#fig.savefig(f"figures/corr_{peptide}_{i}_{j}.png", dpi=300, bbox_inches="tight")

# Evolution of the replica correlation

In [None]:
def read_replica_correff(filenames):
    """
    Read replica correlation coefficients into an array (N,M,K)
        N corresponds to the system in names
        M are the incremental time steps
        K are the replica comparisons

    Each file is parsed with ``np.genfromtxt``; its first column is the frame
    counter and the remaining columns are the coefficients. Files may differ
    in length (and column count): the array grows as needed and every entry
    that a file does not provide is NaN. K is at least 6.

    Parameters
    ----------
    filenames : sequence of str
        Paths containing the system name as a ``/gg...gg/`` path component.

    Returns
    -------
    data : ndarray, shape (N, M, K)
    nframes : ndarray
        First column of the longest file read (empty if `filenames` is empty).
    names : list of str
        System names extracted from the paths, in input order.
    """
    names = []
    # Defined up front so that an empty file list returns instead of raising
    # UnboundLocalError at the return statement.
    nframes = np.empty(0)
    data = np.full((len(filenames), 0, 6), np.nan)
    for k, fname in enumerate(filenames):
        names.append(re.search(r"/(gg\w+gg)/", fname).group(1))
        arr = np.genfromtxt(fname)
        n_rows, n_cols = arr.shape
        n_vals = n_cols - 1  # the first column is the frame counter
        _, m, n = data.shape
        if n_rows > m or n_vals > n:
            # Grow the time and/or comparison axis; new cells start as NaN.
            data = np.pad(data, [(0, 0), (0, max(0, n_rows - m)), (0, max(0, n_vals - n))],
                          mode="constant", constant_values=np.nan)
        # Always write exactly n_vals columns. Slicing with the raw column
        # count only worked when the file filled the whole last axis.
        data[k, :n_rows, :n_vals] = arr[:, 1:]
        if n_rows > m:
            # Keep the frame counter of the longest series seen so far.
            nframes = arr[:, 0]
    return data, nframes, names


def get_final_corref(data):
    """
    Get the last correlation coefficient for each system and replica-replica pair.

    Parameters
    ----------
    data : array-like, shape (N, M, K)
        Coefficient series along axis 1; NaN marks missing entries.

    Returns
    -------
    ndarray, shape (N, K)
        For every system and column, the value at the last non-NaN time step.
        A column without any valid value yields NaN, so the result is always
        rectangular and column k of the output corresponds to column k of
        the input.

    Raises
    ------
    ValueError
        If `data` is not three-dimensional.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 3:
        raise ValueError(f"expected a 3D array (N, M, K), got shape {data.shape}")
    n_systems, n_steps, n_pairs = data.shape
    if n_steps == 0:
        return np.full((n_systems, n_pairs), np.nan)
    valid = ~np.isnan(data)
    # argmax on the time-reversed mask finds the first True from the end,
    # i.e. the last valid time step of every series.
    last_step = n_steps - 1 - np.argmax(valid[:, ::-1, :], axis=1)
    # For an all-NaN series the selected entry is itself NaN, which is the
    # placeholder we want for that column.
    return np.take_along_axis(data, last_step[:, np.newaxis, :], axis=1)[:, 0, :]


def nanquantile(data, quantiles, axis):
    """
    Quantiles of the coefficient series after filling missing values.

    Works on a copy of `data`: every NaN entry is replaced by the final
    coefficient of its (system, replica pair) series as given by
    get_final_corref, then np.nanquantile is evaluated for `quantiles`
    along `axis`.
    """
    filled = data.copy()
    last_values = get_final_corref(filled)
    missing = np.isnan(filled)
    # Positions of the NaN entries; the time-step index is not needed to
    # look up the replacement value.
    system_idx, _, pair_idx = np.nonzero(missing)
    filled[missing] = last_values[system_idx, pair_idx]
    return np.nanquantile(filled, quantiles, axis=axis)
    

# Plot colour (hex string) for each three-letter residue name.
AMINO_ACID_COLORS = dict(
    ALA="#c8c8c8", LEU="#0f820f", ASH="#e60a0a",
    ARG="#145aff", LYS="#145aff", GLH="#e60a0a",
    ASN="#00dcdc", MET="#e6e600", HIP="#145aff",
    ASP="#e60a0a", PHE="#3232aa", SEP="#fa9600",
    CYS="#e6e600", PRO="#c8c8c8", S1P="#fa9600",
    GLN="#00dcdc", SER="#00dcdc", TPO="#fa9600",
    GLU="#e60a0a", THR="#00dcdc", T1P="#fa9600",
    GLY="#c8c8c8", TRP="#b45ab4", PTR="#fa9600",
    HIS="#8282d2", TYR="#3232aa", Y1P="#fa9600",
    ILE="#0f820f", VAL="#0f820f",
)

# Per-curve properties for five curves drawn together:
# first row line styles, second row line widths.
five_props = [
    [":", "-.", "-", "-.", ":"],
    [0.8, 0.8, 1.0, 0.8, 0.8],
]

# Stand-alone colour legend: one round marker per residue name, in
# alphabetical order, drawn on an otherwise empty figure.
legend_entries = []
for label, colo in sorted(AMINO_ACID_COLORS.items()):
    legend_entries.append(
        mpl.lines.Line2D([0], [0],
           marker='o', color='w', label=label, linewidth=0,
           markerfacecolor=colo, markeredgecolor=".2", markersize=10
    ))
fig, ax = plt.subplots(1, 1, figsize=(3, 1), dpi=150)
ax.legend(handles=legend_entries, loc="center", ncol=4)
ax.axis("off")
# savefig does not create missing directories; make sure the output folder
# exists so the notebook runs on a fresh checkout.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/amino_colors.pdf", dpi=300, bbox_inches="tight")

# 5 peptides

In [None]:
def plot_correff_timeseries(data, n_frames, names, *, from_time=0, until_time=100, plot_alpha=.5):
    """
    Plot correlation-coefficient time series next to the density of their final values.

    Left panel: every series in `data` against `n_frames`, coloured by the
    AMINO_ACID_COLORS entry for characters 2-4 of the system name, with the
    5/25/50/75/95 % quantile curves (taken over systems and replica pairs)
    drawn on top in black. Right panel (axes hidden, shared y axis):
    Gaussian KDE of the final coefficients of all series.

    Parameters
    ----------
    data : array, shape (N, M, K), as returned by read_replica_correff
    n_frames : x values for the M time steps
    names : N system names
    from_time, until_time : x-axis limits; major ticks every 200, minor every 50
    plot_alpha : opacity of the individual series

    Returns
    -------
    fig, axs : the figure and its two axes
    """
    fig, axs = plt.subplots(
        1, 2,
        figsize=(4, 2.2), dpi=150, sharey=True,
        gridspec_kw={"wspace": 0, "width_ratios": [5, 1]},
    )

    quantiles = nanquantile(data, [.05, .25, .5, .75, .95], axis=(0, 2))

    # --- left panel: time series -------------------------------------------
    series_ax = axs[0]
    series_ax.grid(True, "major", "both", color="k", linestyle=":", linewidth=.5, alpha=.5)
    # Quantile curves, kept above the individual series via zorder.
    for q_curve, q_style, q_width in zip(quantiles, *five_props):
        series_ax.plot(n_frames, q_curve, linewidth=q_width, color="k", linestyle=q_style, zorder=10)
    # One set of lines per system.
    for system_name, series in zip(names, data):
        first_color = AMINO_ACID_COLORS[system_name[2:5]]
        # Looked up but not used for drawing; a name whose second residue is
        # missing from the colour table still raises KeyError here.
        _second_color = AMINO_ACID_COLORS[system_name[-5:-2]]
        series_ax.plot(n_frames, series, color=first_color, linewidth=.8, linestyle="-",
                       marker="p", markersize=1, alpha=plot_alpha)
    series_ax.set_xlim([from_time, until_time])
    series_ax.set_xticks(np.arange(from_time, until_time+1, 200))
    series_ax.set_xticks(np.arange(from_time, until_time+1,  50), minor=True)
    series_ax.set_xlabel("Simulation time [ns]")
    series_ax.set_yticks(np.arange(-.2, 1.1, .2))
    series_ax.set_ylabel("Pearson's $r$")

    # --- right panel: density of the final values ---------------------------
    density_ax = axs[1]
    density_ax.axis("off")
    final_values = get_final_corref(data).flatten()
    kde = gaussian_kde(final_values)
    r_grid = np.linspace(np.min(final_values), np.max(final_values), 100)
    density = kde(r_grid)
    # The density runs along x so that the coefficient stays on the shared y axis.
    density_ax.fill_betweenx(r_grid, 0, density, linewidth=.8, color="#3232aa", alpha=.3)
    density_ax.plot(density, r_grid, linewidth=.8, color="#3232aa")

    return fig, axs

In [None]:
def format_peptides(peptide_name):
    """
    Render a peptide name such as "ggSEPgGLNgg" as an HTML snippet.

    Each lowercase "g" becomes ``<small>G</small>``. Any other character
    starts a three-character residue code, emitted capitalised inside
    ``<b>...</b>``.

    Raises
    ------
    ValueError
        If the name ends in the middle of a three-character residue code.
        (Letting StopIteration escape instead would silently cut the output
        short when this function is used with ``map``.)
    """
    out = []
    pos, n_chars = 0, len(peptide_name)
    while pos < n_chars:
        if peptide_name[pos] == "g":
            out.append("<small>G</small>")
            pos += 1
            continue
        residue = peptide_name[pos:pos + 3]
        if len(residue) < 3:
            raise ValueError(
                f"incomplete residue code {residue!r} at position {pos} in {peptide_name!r}")
        out.append(f"<b>{residue.capitalize()}</b>")
        pos += 3
    return "".join(out)

# Collect the correlation files of every peptide length and reduce each
# series to its final coefficient.
data, nfrm, names = read_replica_correff(
    sorted(glob.glob("../*peptides/gg*gg/analy/replica_correff8.dat"))
)
final_values = get_final_corref(data)

# One row per peptide: formatted name, one column per column of
# `final_values` (labelled by replica pair), and the row median.
df = pd.DataFrame({
    "Peptide": map(format_peptides, names),
    **{
        column_label: final_values[:, column]
        for column, column_label in enumerate((
            "rreplica(1,2)",
            "rreplica(1,3)",
            "rreplica(1,4)",
            "rreplica(2,3)",
            "rreplica(2,4)",
            "rreplica(3,4)",
        ))
    },
    "Median": np.median(final_values, axis=1),
})
df.to_csv("rreplica_table8.tsv", float_format="%.3f", sep="\t")

In [None]:
# 5peptides set. data5 has axes (peptide, time step, replica comparison).
from_time = 0
until_time = 1109
data5, nfrm5, names5 = read_replica_correff(
    sorted(glob.glob("../5peptides/gg*gg/analy/replica_correff8.dat"))
)
# Rescale the frame counter in place for the x axis, which is labelled in ns.
# NOTE(review): the factor 0.02 is presumably ns per frame — confirm.
nfrm5 *= .02

fig, axs = plot_correff_timeseries(
    data5, nfrm5, names5,
    from_time=from_time, until_time=until_time,
)
#fig.savefig(f"figures/correff_5_{from_time:.0f}-{until_time:.0f}.png", dpi=300, bbox_inches="tight")

# 6 peptides

In [None]:
# 6peptides set. data6 has axes (peptide, time step, replica comparison).
from_time = 0
until_time = 909.
data6, nfrm6, names6 = read_replica_correff(
    sorted(glob.glob("../6peptides/gg*gg/analy/replica_correff8.dat"))
)
# Rescale the frame counter in place for the x axis, which is labelled in ns.
# NOTE(review): the factor 0.02 is presumably ns per frame — confirm.
nfrm6 *= .02

fig, axs = plot_correff_timeseries(
    data6, nfrm6, names6,
    from_time=from_time, until_time=until_time, plot_alpha=.25,
)
#fig.savefig(f"figures/correff_6_{from_time:.0f}-{until_time:.0f}.png", dpi=300, bbox_inches="tight")

# 7 peptides

In [None]:
# 7peptides set. data7 has axes (peptide, time step, replica comparison).
from_time = 0.
until_time = 909.
data7, nfrm7, names7 = read_replica_correff(
    sorted(glob.glob("../7peptides/gg*gg/analy/replica_correff8.dat"))
)
# Rescale the frame counter in place for the x axis, which is labelled in ns.
# NOTE(review): the factor 0.02 is presumably ns per frame — confirm.
nfrm7 *= .02

fig, axs = plot_correff_timeseries(
    data7, nfrm7, names7,
    from_time=from_time, until_time=until_time, plot_alpha=.25,
)
fig.savefig(f"figures/correff_7_{from_time:.0f}-{until_time:.0f}.png", dpi=300, bbox_inches="tight")

# 8 peptides

In [None]:
# 8peptides set. data8 has axes (peptide, time step, replica comparison).
from_time = 0.
until_time = 909.
data8, nfrm8, names8 = read_replica_correff(
    sorted(glob.glob("../8peptides/gg*gg/analy/replica_correff8.dat"))
)
# Rescale the frame counter in place for the x axis, which is labelled in ns.
# NOTE(review): the factor 0.02 is presumably ns per frame — confirm.
nfrm8 *= .02

fig, axs = plot_correff_timeseries(
    data8, nfrm8, names8,
    from_time=from_time, until_time=until_time, plot_alpha=.25,
)
fig.savefig(f"figures/correff_8_{from_time:.0f}-{until_time:.0f}.png", dpi=300, bbox_inches="tight")

# 9 peptides

In [None]:
# 9peptides set. data9 has axes (peptide, time step, replica comparison).
from_time = 0.
until_time = 709.
data9, nfrm9, names9 = read_replica_correff(
    sorted(glob.glob("../9peptides/gg*gg/analy/replica_correff8.dat"))
)
# Rescale the frame counter in place for the x axis, which is labelled in ns.
# NOTE(review): the factor 0.02 is presumably ns per frame — confirm.
nfrm9 *= .02

fig, axs = plot_correff_timeseries(
    data9, nfrm9, names9,
    from_time=from_time, until_time=until_time, plot_alpha=.25,
)
fig.savefig(f"figures/correff_9_{from_time:.0f}-{until_time:.0f}.png", dpi=300, bbox_inches="tight")

# All peptides

In [None]:
# vub blue: #00339F
# Styling passed to Axes.boxplot: black boxes, whiskers and caps, an orange
# median line, and small black "_" markers for the fliers.
properties = dict(
    boxprops=dict(color="k"),
    medianprops=dict(color="#FF6600"),
    whiskerprops=dict(color="k"),
    capprops=dict(color="k"),
    flierprops=dict(marker="_", markersize=2, markeredgecolor="k"),
)

# One flat sample of final correlation coefficients per peptide length,
# in the order 5, 6, 7, 8, 9.
box_data = [
    get_final_corref(length_data).flatten()
    for length_data in (data5, data6, data7, data8, data9)
]

fig, ax = plt.subplots(1, 1, figsize=(3, 3), dpi=96)
ax.grid(True, "major", "y", color="k", linewidth=.5, linestyle="--", alpha=.3)
ax.boxplot(box_data, **properties)
ax.set_xticklabels([5, 6, 7, 8, 9])
ax.set_xlabel("Peptide length")
ax.set_ylabel("Final correlation coefficient")
# Called without data, so no artist is added to the axes.
ax.plot()

fig.savefig(f"figures/correff_boxplot.png", dpi=300, bbox_inches="tight")