In [None]:
import glob, os, re, sys
from typing import Tuple, Union
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from bio2byte.mdutils.utils.BmrbComparison import BmrbComparisonReader, MdDataParser

# Notebook-wide display configuration.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Allow pandas to print up to 1000 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 1000)
# Optional: widen the column limit as well (currently disabled).
# pd.set_option('display.max_columns', 50)

# Parse BMRB data

In [None]:
def aggregate_by_sequence(df):
    """Collapse a per-entry delta table to one row per ``(seq, idx)`` position.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the grouping columns ``seq`` and ``idx``, the residue
        name columns ``apo.resn`` / ``mod.resn``, the entry id columns
        ``apo.bmrbId`` / ``mod.bmrbId`` and the five ``delta.*`` columns
        listed in ``delta_cols`` below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(seq, idx)`` with two-level columns. The outer level is
        ``""`` (first residue names and the number of distinct BMRB ids in
        the group), ``"mean"`` or ``"std"`` (the ``delta.*`` columns
        aggregated with ``np.nanmean`` / ``np.nanstd``).
    """
    # Single source of truth for the columns that get a mean and a spread.
    delta_cols = ['delta.rci', 'delta.helix', 'delta.beta', 'delta.coil', 'delta.ppII']

    def first(x):
        # Representative value of the group: its first row.
        return x.iloc[0]

    def count_uniq(x):
        # Number of distinct values in the group.
        return x.drop_duplicates().size

    gb = df.groupby(["seq", "idx"])
    dfJoin = gb.aggregate({
        'apo.resn': first,
        'mod.resn': first,
        'apo.bmrbId': count_uniq,
        'mod.bmrbId': count_uniq})
    # NOTE(review): some pandas versions dispatch these NumPy callables to
    # their own groupby mean/std implementation (sample std) -- confirm that
    # the spread estimator is the one intended.
    dfMean = gb.aggregate(dict.fromkeys(delta_cols, np.nanmean))
    dfStd = gb.aggregate(dict.fromkeys(delta_cols, np.nanstd))
    return pd.concat([dfJoin, dfMean, dfStd], axis=1, keys=["", "mean", "std"])

# Build one aggregated BMRB delta table per (apo residue, modified residue)
# pair and stack them under the modified-residue code.
dfs, keys = [], []
for apo, mod in [("SER", "SEP"), ("THR", "TPO"), ("TYR", "PTR")]:
    reader = BmrbComparisonReader(apo, mod, identity_margin=(6,6))
    # Feed every reference CSV to the same reader before asking for the deltas.
    for infile in tqdm.tqdm(glob.glob("bmrb_refdata/*.csv")):
        reader.read_file(infile)
    dfs.append(aggregate_by_sequence(reader.get_delta_dataframe()))
    keys.append(mod)
    
# The keys add the modified-residue code as the outermost row level;
# swaplevel(axis=1) makes the attribute name the outer column level and the
# ""/"mean"/"std" tag the inner one.
dfBMRB = pd.concat(dfs, keys=keys).swaplevel(axis=1)
# Drop the temporaries so that later cells cannot pick them up by accident.
del reader, dfs, keys

In [None]:
# Inspect which modified-side BMRB entries contribute SEP comparisons when the
# apo residue is given as ".*" (presumably a match-anything pattern -- confirm
# against BmrbComparisonReader).
reader = BmrbComparisonReader(".*", "SEP", identity_margin=(6,6))
for infile in tqdm.tqdm(glob.glob("bmrb_refdata/*.csv")):
    reader.read_file(infile)
# Cell output: the distinct values of the "mod.bmrbId" column.
reader.get_delta_dataframe()["mod.bmrbId"].drop_duplicates()

# Parse MD data

In [None]:
# Collect the simulation directories matching "gg*" under ../5peptides ... ../9peptides.
mddirs = [d for i in [5,6,7,8,9] for d in glob.glob(f"../{i}peptides/gg*")]

# Build one MD delta table per (apo residue, modified residue) pair.
dfs, keys = [], []
for apo, mod in [("SER", "SEP"), ("THR", "TPO"), ("TYR", "PTR")]:
    readMD = MdDataParser(apo, mod)
    # pair_apo_mod yields (apo directory, modified directory) pairs out of mddirs.
    for apodir, moddir in tqdm.tqdm(readMD.pair_apo_mod(mddirs)):
        readMD.read_dirs(apodir, moddir, method="window(5)")
    df = readMD.get_delta_df()
    # Index by (segid, idx); the plotting helpers group on these index levels.
    df.set_index(["segid", "idx"], inplace=True)
    dfs.append(df)
    keys.append(mod)
    
# The keys add the modified-residue code as the outermost row level.
dfMD = pd.concat(dfs, keys=keys)
# Only the parser is released here; dfs, keys and df stay defined.
del readMD

# Plot BMRB data

In [None]:
def convolve1d(arr, kernel):
    """Convolve ``arr`` with ``kernel`` using NumPy's "same" output mode."""
    smoothed = np.convolve(arr, kernel, mode="same")
    return smoothed

def get_derivative(xarr, yarr):
    """Return the successive differences of ``yarr`` after sorting by ``xarr``.

    Both inputs must be NumPy arrays of equal length. The result is a pair
    ``(x, dy)`` with one element fewer than the inputs: ``dy`` holds the
    differences between consecutive sorted ``y`` values, and ``x`` is
    ``sorted_x[1:] - diff(sorted_x)``, which places every difference on the
    left-hand sample of its interval.
    """
    # NOTE(review): compare_derivatives in this notebook places differences on
    # interval midpoints instead -- confirm which convention is intended.
    order = np.argsort(xarr)
    x_sorted = xarr[order]
    y_sorted = yarr[order]
    dy = np.diff(y_sorted)
    x_left = x_sorted[1:] - np.diff(x_sorted)
    return x_left, dy

def plotBmrb(df, colname: Union[str,Tuple], ax, *, convolve=None, derivative=False, factor=None, **kwargs):
    """Draw one error-bar trace per sequence for a BMRB attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Row index must contain the levels ``seq`` and ``idx``;
        ``df.xs(colname, axis=1)`` must yield the columns ``mean`` and ``std``.
    colname : str or tuple
        Column key selected with ``df.xs(..., axis=1)``.
    ax : matplotlib axes
        Target axes; only ``ax.errorbar`` is called.
    convolve : sequence of float, optional
        Kernel applied to the mean values with ``np.convolve(mode="same")``.
    derivative : bool
        Plot successive differences instead of the values. Each difference is
        placed on the left-hand index of its interval (the same placement as
        ``get_derivative``), and the error bar is the quadrature sum of the
        two neighbouring standard deviations.
    factor : float, optional
        Multiplier applied to the mean values before any other processing.
    **kwargs
        Forwarded to ``ax.errorbar``.
    """
    per_sequence = df.xs(colname, axis=1).reset_index(level="idx").groupby("seq")
    for _, data in per_sequence:
        xvals = np.array(data["idx"])
        # Float copies: scaling must not fail on integer input nor touch df.
        yvals = np.array(data["mean"], dtype=float)
        ystd = np.array(data["std"], dtype=float)
        if factor is not None:
            yvals = yvals * factor
        if convolve is not None:
            yvals = np.convolve(yvals, convolve, mode="same")
        if derivative:
            # Sort all three arrays together so the error bars stay attached
            # to the points they belong to.
            order = np.argsort(xvals)
            xvals, yvals, ystd = xvals[order], yvals[order], ystd[order]
            xvals = xvals[1:] - np.diff(xvals)
            yvals = np.diff(yvals)
            # A difference has one point fewer than its input; combine the two
            # contributing spreads (assumes they are independent) so that
            # ``yerr`` has the same length as ``y``.
            ystd = np.sqrt(ystd[:-1] ** 2 + ystd[1:] ** 2)
        ax.errorbar(xvals, yvals, yerr=ystd, **kwargs)

def plotMD(df, colname: Union[str,Tuple], ax, *, convolve=None, derivative=False, factor=None, **kwargs):
    """Draw one line per ``segid`` for an MD attribute.

    ``df`` must carry the index levels ``segid`` and ``idx`` and a column
    ``colname``. The values are optionally scaled by ``factor``, smoothed with
    ``convolve1d`` using the ``convolve`` kernel and differentiated with
    ``get_derivative``, in that order, before ``ax.plot`` is called with the
    remaining keyword arguments.
    """
    column = df.xs(colname, axis=1)
    per_segment = column.reset_index(level="idx").groupby("segid")
    for _, segment in per_segment:
        xvals = np.array(segment["idx"])
        yvals = np.array(segment[colname])
        if factor is not None:
            yvals *= factor
        if convolve is not None:
            yvals = convolve1d(yvals, kernel=convolve)
        if derivative:
            xvals, yvals = get_derivative(xvals, yvals)
        ax.plot(xvals, yvals, **kwargs)
        
def format_axes(axs, ymin=-1., ymax=1.):
    """Apply the shared panel styling to every axes of a subplot grid.

    Parameters
    ----------
    axs : numpy.ndarray of matplotlib axes
        Typically the array returned by ``plt.subplots``. For a 2-D array the
        first column and the bottom row are derived from ``axs.shape``; for
        any other shape the historical 4x3 layout is assumed.
    ymin, ymax : float
        y-limits; y ticks are placed every 0.2 from ``ymin`` to ``ymax``.

    Panels are lettered A, B, C, ... in flattened (row-major) order; panels
    beyond the 26th are left untouched. Panels in the first column get their
    letter further left than the others, and only the bottom row gets an
    x-axis label.
    """
    shape = getattr(axs, "shape", ())
    if len(shape) == 2:
        nrows, ncols = shape
    else:
        nrows, ncols = 4, 3  # layout the original hard-coded indices assumed
    first_bottom_panel = (nrows - 1) * ncols

    for i, (lbl, ax) in enumerate(zip("ABCDEFGHIJKLMNOPQRSTUVWXYZ", axs.flatten())):
        # Grid settings
        ax.set_facecolor("0.95")
        ax.grid(True, which="major", axis="both", color="w", linewidth=1, zorder=1)
        # Broad translucent band marking sequence index 0.
        ax.axvline(0, color="w", linewidth=10., alpha=.7)
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.spines['bottom'].set_position('zero')
        # Tick settings
        ax.set_ylim([ymin, ymax])
        # The +.01 keeps ymax itself inside the half-open arange interval.
        ax.set_yticks(np.arange(ymin, ymax + .01, .2))
        ax.set_xlim([-5.5, 5.5])
        ax.set_xticks(np.arange(-5, 5.1, 1))
        ax.set_xticklabels([])
        ax.tick_params(labelsize=9)
        # Panel label
        label_x = -.3 if i % ncols == 0 else -.08
        ax.text(label_x, 1, lbl, size=12, transform=ax.transAxes)
        if i >= first_bottom_panel:
            ax.set_xlabel("Sequence index", size=10)
            
# Keyword bundle for the MD traces: the "convolve" entry is consumed by
# plotMD, every other entry is forwarded to matplotlib.
MD_FORMAT = {
    "convolve": [.25, .5, .25],
    "color": "0.3",
    "linewidth": .8,
    "marker": "o",
    "markersize": 1.3,
    "alpha": .1,
    "zorder": 10,
}

# Keyword bundle for the BMRB traces: no smoothing, violet diamond markers,
# and a higher zorder than the MD traces.
BMRB_FORMAT = {
    "convolve": None,
    "color": "violet",
    "linewidth": .8,
    "marker": "d",
    "markersize": 2.3,
    "markerfacecolor": "violet",
    "markeredgecolor": "purple",
    "markeredgewidth": .3,
    "alpha": .4,
    "zorder": 20,
}

#### Compare absolute values

In [None]:
# Row order of the figure: one BMRB attribute and its MD counterpart per row.
ATTRIBUTES_NMR = ['delta.rci', 'delta.helix', 'delta.beta', 'delta.coil']
ATTRIBUTES_MD = ['d.CircVar', 'd.helix', 'd.beta', 'd.Other']

# 4 attribute rows x 3 residue columns, all panels sharing both axes.
fig, axs = plt.subplots(
    4, 3,
    figsize=(9, 7),
    dpi=150,
    sharex=True,
    sharey=True,
    gridspec_kw={"hspace": .2, "wspace": .1},
)
fig.set_facecolor("w")

for i, mod in enumerate(["SEP", "TPO", "PTR"]):
    bmrdata = dfBMRB.xs(mod)
    mddata = dfMD.xs(mod)
    # MD traces first; 'd.CircVar' is the only attribute plotted sign-flipped.
    for ax, attr in zip(axs[:, i], ATTRIBUTES_MD):
        md_factor = -1 if attr == 'd.CircVar' else None
        plotMD(mddata, attr, ax, factor=md_factor, **MD_FORMAT)
    # BMRB traces go on the same axes.
    for ax, attr in zip(axs[:, i], ATTRIBUTES_NMR):
        plotBmrb(bmrdata, attr, ax, **BMRB_FORMAT)

# Shared styling with a +/-0.6 y-range, then export.
format_axes(axs, -.6, .6)
fig.savefig("figures/bmrb_comparison_absolute.pdf", dpi=300, bbox_inches="tight")

#### Compare derivative values

In [None]:
# Row order of the figure: one BMRB attribute and its MD counterpart per row.
ATTRIBUTES_NMR = ['delta.rci', 'delta.helix', 'delta.beta', 'delta.coil']
ATTRIBUTES_MD = ['d.CircVar', 'd.helix', 'd.beta', 'd.Other']

# 4 attribute rows x 3 residue columns, all panels sharing both axes.
fig, axs = plt.subplots(4,3, figsize=(9,7), dpi=150, sharex=True, sharey=True, 
                        gridspec_kw = {"hspace": .2, "wspace": .1})
fig.set_facecolor("w")

for i, mod in enumerate(["SEP", "TPO", "PTR"]):
    bmrdata = dfBMRB.xs(mod)
    mddata = dfMD.xs(mod)
    # MD traces first; 'd.CircVar' is the only attribute plotted sign-flipped.
    for ax, attr in zip(axs[:,i], ATTRIBUTES_MD):
        plotMD(mddata, attr, ax, derivative=True, **MD_FORMAT, factor=(None if attr != 'd.CircVar' else -1))
    # BMRB traces go on the same axes.
    for ax, attr in zip(axs[:,i], ATTRIBUTES_NMR):
        plotBmrb(bmrdata, attr, ax, derivative=True, **BMRB_FORMAT)

# Format the axes. The lower limit used to read "-." (a syntax error); it is
# now the mirror image of the upper limit.
format_axes(axs, -.4, .4)
fig.savefig("figures/bmrb_comparison_derivative.pdf", dpi=300, bbox_inches="tight")

# BMRB / MD comparison

### Compare absolute values

In [None]:
def compare_plot(ax, df1, xcol1, ycol1, df2, xcol2, ycol2, f1=1., f2=1.):
    """Overlay smoothed MD traces and raw BMRB traces on one axes.

    Parameters
    ----------
    ax : matplotlib axes
        Target axes.
    df1, xcol1, ycol1 : DataFrame, str, str
        MD table with a ``segid`` column; one trace is drawn per ``segid``
        after multiplying ``ycol1`` by ``f1`` and smoothing it with a
        ``[.25, .5, .25]`` kernel.
    df2, xcol2, ycol2 : DataFrame, str, str
        BMRB table with a ``segid`` column; one trace is drawn per ``segid``
        after multiplying ``ycol2`` by ``f2``. A segment with a missing value
        anywhere in ``ycol2`` is skipped as a whole.
    f1, f2 : float
        Scale factors for the MD and the BMRB values respectively.

    The number of BMRB segments drawn is written into the top-right corner
    as ``N = ...``. An input in which no segment can be drawn leaves the
    corresponding scatter layer out instead of raising.
    """
    KERNEL = np.array([.25, .50, .25])

    ax.grid(True, which="major", axis="y", color="grey", linewidth=.3, alpha=.7, zorder=1)

    # Plot MD data: a faint line per segment plus one scatter layer for all.
    md_x, md_y = [], []
    for _, segment in df1[["segid", xcol1, ycol1]].groupby("segid"):
        xvals = segment[xcol1].to_numpy()
        # Out-of-place scaling on a float copy: works for integer columns and
        # never writes back into df1.
        yvals = np.convolve(segment[ycol1].to_numpy(dtype=float) * f1, KERNEL, mode="same")
        ax.plot(xvals, yvals, color="lightsteelblue", linewidth=1, alpha=.05, zorder=1)
        md_x.append(xvals)
        md_y.append(yvals)
    if md_x:  # np.concatenate rejects an empty list
        ax.scatter(np.concatenate(md_x), np.concatenate(md_y), s=2, marker="o", alpha=.3, zorder=3,
                   color="lightsteelblue", edgecolors="steelblue", linewidth=.3)

    # Plot BMRB data
    bmrb_x, bmrb_y = [], []
    for _, segment in df2[["segid", xcol2, ycol2]].groupby("segid"):
        yvals = segment[ycol2].to_numpy(dtype=float)
        if np.isnan(yvals).any():
            continue
        xvals = segment[xcol2].to_numpy()
        yvals = yvals * f2
        ax.plot(xvals, yvals, color="violet", linewidth=1, alpha=.2, zorder=2)
        bmrb_x.append(xvals)
        bmrb_y.append(yvals)
    n_entries = len(bmrb_x)
    if bmrb_x:
        ax.scatter(np.concatenate(bmrb_x), np.concatenate(bmrb_y), s=2, marker="D", alpha=.4, zorder=5,
                   color="violet", edgecolors="purple", linewidth=.3)

    # Plot N samples
    ax.text(.98, .95, f"N = {n_entries}", va="top", ha="right", transform=ax.transAxes)

In [None]:
# NOTE(review): `mdDFs` and `bmrbDFs` are not created by any cell shown in
# this notebook; this cell needs them to map "SEP"/"TPO"/"PTR" to frames with
# a "segid" column. Confirm where they are built before a Restart & Run All.

# One row per attribute: (BMRB column, MD column[, BMRB factor, MD factor]).
ATTRIBUTES = [("d.rci", "d.CircVar", 1., -1.), ("d.helix",)*2, ("d.beta",)*2, ("d.coil",)*2]
# Raw strings: "\D" and "\;" are LaTeX commands, not Python escape sequences.
YLABELS = [r"$\Delta S^{2}_{RCI} \; | \; -\Delta CV$", r"$\Delta P(X = helix)$",
           r"$\Delta P(X = sheet)$", r"$\Delta P(X = coil)$"]

fig, axs = plt.subplots(4,3, figsize=(9,7), dpi=150, sharex=True, sharey=True, 
                        gridspec_kw = {"hspace": .2, "wspace": .1})
fig.set_facecolor("w")

# One column per modified residue, one row per attribute.
for col, (MODRES, title) in enumerate([("SEP", "Serine"), ("TPO", "Threonine"), ("PTR", "Tyrosine")]):
    for ax, (attrBMRB, attrMD, *factors) in zip(axs[:, col], ATTRIBUTES):
        factor_bmrb, factor_md = factors or (1., 1.)
        # compare_plot takes the MD table first, so f1 scales MD and f2 scales BMRB.
        compare_plot(ax, mdDFs[MODRES], "idx", attrMD, bmrbDFs[MODRES], "idx", attrBMRB,
                     f1=factor_md, f2=factor_bmrb)
    axs[0, col].set_title(title)

# y-axis labels only on the first column.
for ax, lbl in zip(axs[:, 0], YLABELS):
    ax.set_ylabel(lbl, size=10)

# The panels share both axes, so limits and ticks set on one apply to all.
axs[-1, -1].set_ylim([-.6, .6])
axs[-1, -1].set_yticks(np.arange(-.6, .61, .2))
axs[-1, -1].set_xlim([-5.5, 5.5])
axs[-1, -1].set_xticks(np.arange(-5, 5.1, 1))

# Panel letters (further left on the first column) and bottom-row x labels.
for i, (lbl, ax) in enumerate(zip("ABCDEFGHIJKL", axs.flatten())):
    ax.tick_params(labelsize=9)
    if i % 3 == 0:
        ax.text(-.3, 1, lbl, size=12, transform=ax.transAxes)
    else:
        ax.text(-.08, 1, lbl, size=12, transform=ax.transAxes)
    if i > 8:
        ax.set_xlabel("Sequence index", size=10)

fig.savefig("figures/bmrb_comparison_window.png", dpi=300, bbox_inches="tight")

### Compare first derivatives

In [None]:
def compare_derivatives(ax, df1, xcol1, ycol1, df2, xcol2, ycol2, f1=1., f2=1.):
    """Overlay first differences of MD and BMRB traces on one axes.

    Parameters
    ----------
    ax : matplotlib axes
        Target axes.
    df1, xcol1, ycol1 : DataFrame, str, str
        MD table with a ``segid`` column. Per ``segid``, ``ycol1`` is
        multiplied by ``f1``, smoothed with a ``[.25, .5, .25]`` kernel and
        differenced.
    df2, xcol2, ycol2 : DataFrame, str, str
        BMRB table with the columns ``segid``, ``apo.bmrbId`` and
        ``mod.bmrbId``. Per ``segid``, ``ycol2`` is multiplied by ``f2`` and
        differenced; a segment with a missing value anywhere in ``ycol2`` is
        skipped as a whole.
    f1, f2 : float
        Scale factors for the MD and the BMRB values respectively.

    Each difference is drawn at the midpoint of the two x values it spans.
    The ``N = ...`` annotation counts the distinct
    ``(apo.bmrbId, mod.bmrbId)`` pairs among the rows of ``df2`` whose
    ``ycol2`` is not missing; this need not equal the number of traces drawn.
    An input in which no segment can be drawn leaves the corresponding
    scatter layer out instead of raising.
    """
    KERNEL = np.array([.25, .50, .25])

    ax.grid(True, which="major", axis="y", color="grey", linewidth=.3, alpha=.7, zorder=1)

    def _midpoint_diff(xvals, yvals):
        # First difference of y, placed halfway between neighbouring x values.
        return .5 * (xvals[:-1] + xvals[1:]), np.diff(yvals)

    # Plot MD data: a faint line per segment plus one scatter layer for all.
    md_x, md_y = [], []
    for _, segment in df1[["segid", xcol1, ycol1]].groupby("segid"):
        # Out-of-place scaling on a float copy: works for integer columns and
        # never writes back into df1.
        smoothed = np.convolve(segment[ycol1].to_numpy(dtype=float) * f1, KERNEL, mode="same")
        xvals, yvals = _midpoint_diff(segment[xcol1].to_numpy(), smoothed)
        ax.plot(xvals, yvals, color="lightsteelblue", linewidth=1, alpha=.05, zorder=1)
        md_x.append(xvals)
        md_y.append(yvals)
    if md_x:  # np.concatenate rejects an empty list
        ax.scatter(np.concatenate(md_x), np.concatenate(md_y), s=2, marker="o", alpha=.3, zorder=3,
                   color="lightsteelblue", edgecolors="steelblue", linewidth=.3)

    # Plot BMRB data
    bmrb_x, bmrb_y = [], []
    for _, segment in df2[["segid", xcol2, ycol2]].groupby("segid"):
        raw = segment[ycol2].to_numpy(dtype=float)
        if np.isnan(raw).any():
            continue
        xvals, yvals = _midpoint_diff(segment[xcol2].to_numpy(), raw * f2)
        ax.plot(xvals, yvals, color="violet", linewidth=1, alpha=.2, zorder=2)
        bmrb_x.append(xvals)
        bmrb_y.append(yvals)
    if bmrb_x:
        ax.scatter(np.concatenate(bmrb_x), np.concatenate(bmrb_y), s=2, marker="D", alpha=.4, zorder=5,
                   color="violet", edgecolors="purple", linewidth=.3)

    # Plot N samples
    n_entries = df2.loc[~df2[ycol2].isna(), ['apo.bmrbId', 'mod.bmrbId']].drop_duplicates().shape[0]
    ax.text(.98, .95, f"N = {n_entries}", va="top", ha="right", transform=ax.transAxes)

In [None]:
# NOTE(review): `mdDFs` and `bmrbDFs` are not created by any cell shown in
# this notebook; this cell needs them to map "SEP"/"TPO"/"PTR" to frames with
# a "segid" column. Confirm where they are built before a Restart & Run All.

# One row per attribute: (BMRB column, MD column[, BMRB factor, MD factor]).
ATTRIBUTES = [("d.rci", "d.CircVar", 1., -1.), ("d.helix",)*2, ("d.beta",)*2, ("d.coil",)*2]
# Raw strings: "\D" and "\;" are LaTeX commands, not Python escape sequences.
YLABELS = [r"$\Delta S^{2}_{RCI} \; | \; -\Delta CV$", r"$\Delta P(X = helix)$",
           r"$\Delta P(X = sheet)$", r"$\Delta P(X = coil)$"]
YLABELS = [ylbl + r" $\frac{dy}{dx}$" for ylbl in YLABELS]

fig, axs = plt.subplots(4,3, figsize=(9,7), dpi=150, sharex=True, sharey=True, 
                        gridspec_kw = {"hspace": .2, "wspace": .1})
fig.set_facecolor("w")

# One column per modified residue, one row per attribute.
for col, (MODRES, title) in enumerate([("SEP", "Serine"), ("TPO", "Threonine"), ("PTR", "Tyrosine")]):
    for ax, (attrBMRB, attrMD, *factors) in zip(axs[:, col], ATTRIBUTES):
        factor_bmrb, factor_md = factors or (1., 1.)
        # compare_derivatives takes the MD table first, so f1 scales MD and f2 scales BMRB.
        compare_derivatives(ax, mdDFs[MODRES], "idx", attrMD, bmrbDFs[MODRES], "idx", attrBMRB,
                            f1=factor_md, f2=factor_bmrb)
    axs[0, col].set_title(title)

# y-axis labels only on the first column.
for ax, lbl in zip(axs[:, 0], YLABELS):
    ax.set_ylabel(lbl)

# The panels share both axes, so limits and ticks set on one apply to all.
axs[-1, -1].set_ylim([-.6, .6])
axs[-1, -1].set_yticks(np.arange(-.6, .61, .2))
axs[-1, -1].set_xlim([-5.5, 5.5])
axs[-1, -1].set_xticks(np.arange(-5, 5.1, 1))

# Panel letters (further left on the first column) and bottom-row x labels.
for i, (lbl, ax) in enumerate(zip("ABCDEFGHIJKL", axs.flatten())):
    ax.tick_params(labelsize=9)
    if i % 3 == 0:
        ax.text(-.3, 1, lbl, size=12, transform=ax.transAxes)
    else:
        ax.text(-.08, 1, lbl, size=12, transform=ax.transAxes)
    if i > 8:
        ax.set_xlabel("Sequence index", size=10)

fig.savefig("figures/bmrb_comparison_derivative_window.png", dpi=300, bbox_inches="tight")