In [None]:
import glob
import os
import random
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.patches import FancyBboxPatch
import numpy as np
import pandas as pd

%matplotlib inline

# Lookup table from three-letter residue name to a hex colour string.
# The plotting cells index it with residue names taken from the "-"-joined
# `sequence` column to colour individual beeswarm points. Several residues
# deliberately share one colour (e.g. ASP/GLU/ASH/GLH, or ARG/LYS/HIP).
# NOTE(review): the third column appears to hold protonation/phosphorylation
# variants of the standard residues - confirm against the force-field naming.
AMINO_ACID_COLORS = {
    "ALA": "#c8c8c8", "LEU": "#0f820f", "ASH": "#e60a0a",
    "ARG": "#145aff", "LYS": "#145aff", "GLH": "#e60a0a",
    "ASN": "#00dcdc", "MET": "#e6e600", "HIP": "#145aff",
    "ASP": "#e60a0a", "PHE": "#3232aa", "SEP": "#fa9600",
    "CYS": "#e6e600", "PRO": "#c8c8c8", "S1P": "#fa9600",
    "GLN": "#00dcdc", "SER": "#00dcdc", "TPO": "#fa9600",
    "GLU": "#e60a0a", "THR": "#00dcdc", "T1P": "#fa9600",
    "GLY": "#c8c8c8", "TRP": "#b45ab4", "PTR": "#fa9600",
    "HIS": "#8282d2", "TYR": "#3232aa", "Y1P": "#fa9600",
    "ILE": "#0f820f", "VAL": "#0f820f"
}

def suppressPandasChainAssignmentWarning(func):
    """Decorator that runs ``func`` with pandas' chained-assignment check disabled.

    The ``mode.chained_assignment`` option is set to ``None`` through
    ``pd.option_context`` for the duration of each call only, so the
    previous setting is restored when the call finishes, including when
    ``func`` raises.

    Parameters
    ----------
    func : callable
        The function to wrap.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged. The wrapper keeps
        ``func``'s name and docstring.
    """
    # Local import so this definition does not depend on the import cell.
    import functools

    @functools.wraps(func)  # without this every wrapped function is named "_inner_"
    def _inner_(*args, **kwargs):
        with pd.option_context("mode.chained_assignment", None):
            return func(*args, **kwargs)
    return _inner_

def read_constava(*input_files, filter_by_method: str = None):
    """Read one or more Constava CSV files into a single DataFrame.

    Each file is read with its first row as header. If ``filter_by_method``
    is given, only rows whose ``Method`` column equals it are kept. A
    ``sequence`` column is then added to every row of that file, holding the
    file's (remaining) ``ResName`` values joined with ``"-"``.

    Parameters
    ----------
    *input_files : str
        Paths of the CSV files to read; each is made absolute before reading.
    filter_by_method : str, optional
        Value of the ``Method`` column to keep. ``None`` keeps all rows.

    Returns
    -------
    pandas.DataFrame
        All per-file frames stacked in the order given, with a fresh
        0..n-1 index. An empty DataFrame if no files were passed.
    """
    frames = []
    for fpath in (os.path.abspath(fp) for fp in input_files):
        df = pd.read_csv(fpath, header=0)
        if filter_by_method is not None:
            df = df.loc[df["Method"] == filter_by_method]
        # `assign` returns a new frame instead of writing into a filtered
        # slice, so no chained-assignment warning can be triggered here and
        # no warning suppression is required.
        frames.append(df.assign(sequence="-".join(df["ResName"])))

    if not frames:
        # pd.concat raises on an empty list; keep the "no input" result empty.
        return pd.DataFrame()
    # Concatenate once: avoids quadratic re-copying and avoids mixing an empty
    # seed frame into the dtype resolution.
    return pd.concat(frames, ignore_index=True)
        
def beeswarm(ax, x: float, y: np.ndarray, /, *, c=None, binsize=.025, offset_func=lambda n: .1 * np.log(n), **scatter_kwargs):
    """Draw one beeswarm column of the values ``y`` centred on ``x``.

    The values are binned along y; the points of each non-empty bin are
    spread evenly over ``[x - offset, x + offset]`` with
    ``offset = offset_func(number of points in the bin)`` and drawn with one
    ``ax.scatter`` call per bin.

    Parameters
    ----------
    ax : object with a ``scatter`` method
        Target axes.
    x : float
        Horizontal centre of the column.
    y : array-like
        Values to plot. May be empty, in which case nothing is drawn.
    c : optional
        Colours. If it is a sequence with one entry per value in ``y`` it is
        sliced per bin; anything else (``None``, a single colour) is passed
        to ``scatter`` unchanged.
    binsize : float
        Target bin height. The number of bin *edges* is
        ``ceil((max - min) / binsize)`` (at least 2), so the actual bins are
        slightly taller than ``binsize``.
    offset_func : callable
        Maps the number of points in a bin to the horizontal half-width.
    **scatter_kwargs
        Forwarded to ``ax.scatter``.

    Returns
    -------
    tuple
        ``(min(y), max(y))``, or ``(nan, nan)`` when ``y`` is empty.
    """
    y = np.asarray(y)
    if y.size == 0:
        # np.min / np.max raise on zero-size input; there is nothing to draw.
        return np.nan, np.nan

    # Only per-point colours can be masked per bin.
    per_point_colors = None
    if c is not None:
        c_arr = np.asarray(c)
        if c_arr.ndim >= 1 and c_arr.shape[0] == y.size:
            per_point_colors = c_arr

    ymin, ymax = np.min(y), np.max(y)
    # Edges are padded by 1% of a bin so that min and max fall inside a bin.
    bin_edges = np.linspace(ymin - .01*binsize, ymax + .01*binsize, max(2, int(np.ceil((ymax - ymin) / binsize))))

    bin_indices = np.digitize(y, bin_edges)
    for i in range(1, len(bin_edges)):
        msk = (bin_indices == i)
        yvals = y[msk]
        yn = yvals.size
        if yn:
            xoffset = offset_func(yn)
            xvals = np.linspace(x - xoffset, x + xoffset, yn)
            bin_colors = c if per_point_colors is None else per_point_colors[msk]
            ax.scatter(xvals, yvals, c=bin_colors, **scatter_kwargs)
    return ymin, ymax


def amino_beeswarm(dataiterator, ax=None, title=None, xlabel=None, ylabel=None, grid=True, **kwargs):
    """Draw one beeswarm column per entry of ``dataiterator`` on a shared axes.

    Parameters
    ----------
    dataiterator : iterable of (label, data, colors)
        One entry per column. ``label`` becomes the x tick label, ``data``
        holds the values and ``colors`` is passed to ``beeswarm`` as ``c``.
        Entries with empty ``data`` keep their tick but draw no points.
    ax : matplotlib axes, optional
        Axes to draw on. A new 8x3 inch figure is created when omitted.
    title, xlabel, ylabel : str, optional
        Applied to the axes when not ``None``.
    grid : bool
        Whether to draw thin horizontal major grid lines.
    **kwargs
        Override the default scatter styling
        (``edgecolors="k", linewidth=.5, alpha=1, s=30``).

    Returns
    -------
    The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(1,1, figsize=(8, 3), dpi=96)
    if grid:
        ax.grid(True, "major", "y", color="0.6", linewidth=.3)

    bee_kwargs = dict(edgecolors="k", linewidth=.5, alpha=1, s=30)
    bee_kwargs.update(kwargs)

    labels = []
    for x, (lbl, data, colors) in enumerate(dataiterator):
        labels.append(lbl)
        if np.size(data) == 0:
            # Nothing to draw, but the column keeps its slot on the x axis.
            continue
        beeswarm(ax, x, data, c=colors, **bee_kwargs)

    # Derive the axis layout from the number of labels rather than from the
    # loop variable, which is unbound when `dataiterator` is empty.
    n_columns = len(labels)
    ax.set_xticks(np.arange(0, n_columns))
    ax.set_xticklabels(labels, rotation=-90)
    ax.set_xlim([-1, n_columns])
    if title is not None:
        ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    return ax

In [None]:
method = "window(3)"

# One DataFrame per "../<N>peptides" directory, N = 5..9. The glob matches are
# sorted because glob.glob returns them in filesystem order, which would make
# the row order (and anything derived from it) differ between machines.
peptide_frames = {
    length: read_constava(
        *sorted(glob.glob(f"../{length}peptides/gg*gg/analy/constava.csv")),
        filter_by_method=method,
    )
    for length in range(5, 10)
}
df5, df6, df7, df8, df9 = (peptide_frames[length] for length in range(5, 10))

# Residue names in the order they appear on the x axis of the plots below.
residues = [
    'ALA', 'ARG', 'ASN', 'ASP', 'ASH', 
    'CYS', 'GLN', 'GLU', 'GLH', 'GLY',
    'HIS', 'HIP', 'ILE', 'LEU', 'LYS',
    'MET', 'PHE', 'PRO', 'TRP', 'VAL',
    'SER', 'S1P', 'SEP',
    'THR', 'T1P', 'TPO',
    'TYR', 'Y1P', 'PTR'
]

# Value columns of the Constava tables that get plotted.
attributes = ["ConStaVa", "coreHelix", "surrHelix", "coreSheet", "surrSheet", "Turn", "Other"]
df9.head()

# Dynamics

In [None]:
# Echo the list of value-column names that the plotting cells iterate over.
attributes

In [None]:
new_params = {
    "font.size": 8.,
}
old_params = {k: mpl.rcParams[k] for k in new_params.keys()}
mpl.rcParams.update(new_params)

# (column name, y-axis label) per panel. Raw strings keep the mathtext
# commands "\it" and "\," from being parsed as invalid Python escapes.
panels = [
    ("coreHelix", r"$\it{Core \, helix}$ propensity"), ("surrHelix", r"$\it{Surr. \, helix}$ propensity"),
    ("Turn", r"$\it{Turn}$ propensity"), ("coreSheet", r"$\it{Core \, sheet}$ propensity"),
    ("surrSheet", r"$\it{Surr. \, sheet}$ propensity"), ("Other", r"$\it{Other}$ propensity")]

# Left edge and colour of the shaded boxes. Each box is 2.92 wide and so spans
# three violins: x positions 21-23, 24-26 and 27-29, i.e. the last three
# groups of three entries in `residues`.
highlight_boxes = [(20.54, "C1"), (23.54, "C2"), (26.54, "C4")]

try:
    fig, axs = plt.subplots(3,2, figsize=(7,5.5), dpi=96,
                            gridspec_kw={"left":0, "right":1, "bottom":0, "top":1, "hspace": .3})

    # order="F" fills the 3x2 grid column by column.
    for ax, (key, lbl) in zip(axs.flatten(order="F"), panels):

        xlabels = list(residues)
        data = []
        for resn in residues:
            values = np.zeros((0,))
            for peptlength, df in enumerate([df5, df6, df7, df8, df9], start=5):
                # Pool the rows at #ResIndex 4 and at #ResIndex peptlength-1,
                # the same two positions the beeswarm cells select. The second
                # index must come from the peptide length, not from the
                # position of `resn` in `residues`. `isin` also avoids counting
                # rows twice when both indices coincide (peptlength == 5).
                selected = (df["ResName"] == resn) & df["#ResIndex"].isin([4, peptlength - 1])
                values = np.concatenate([values, df.loc[selected, key].to_numpy()])
            data.append(values)

        ax.grid(True, "major", "y", color="grey", alpha=.5, zorder=1)
        for x_left, facecolor in highlight_boxes:
            ax.add_patch(
                FancyBboxPatch((x_left, 0.), 2.92, .67, mutation_aspect=.1, boxstyle='Round, pad=0, rounding_size=.5',
                               linestyle="none", facecolor=facecolor, alpha=.2))

        violins = ax.violinplot(data, bw_method=.2, widths=.8, showmeans=False, showmedians=False, showextrema=False)
        for i, vln in enumerate(violins['bodies']):
            if i < 20:
                # First 20 violins: continuous colour ramp.
                vln.set_facecolor(cm.BuPu(i / 19.))
            else:
                # Remaining violins cycle through three shades (0, .5, 1),
                # restarting with each group of three.
                vln.set_facecolor(cm.BuPu(.5 * ((i+1) % 3)))
            vln.set_edgecolor('black')
            vln.set_linewidth(.7)
            vln.set_alpha(.8)

        ax.set_xticks(np.arange(1,30))
        ax.set_xlim([0,30])
        ax.set_xticklabels(xlabels, rotation=-60, size=7)
        ax.set_ylim([-.01, .72])
        ax.set_ylabel(lbl)

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok=True)
    fig.savefig("figures/constava_300dpi_win3.pdf", dpi=300, bbox_inches="tight")
finally:
    # Restore the global font size even if plotting or saving failed.
    mpl.rcParams.update(old_params)

In [None]:
# savefig does not create missing directories.
os.makedirs("figures", exist_ok=True)

# One two-panel figure per attribute; the frames for peptide lengths 5, 8 and
# 9 are overlaid on the same pair of axes.
for attr in attributes:
    ylabel = attr

    fig, axs = plt.subplots(1,2, figsize=(12, 3), dpi=96, sharey=True, gridspec_kw={"hspace": .5, "wspace": .05})

    for i, df in ((5, df5), (8, df8), (9, df9)):
        ldata = []
        rdata = []
        for resn in residues:
            is_resn = df["ResName"] == resn
            # Left panel: rows at #ResIndex 4, coloured by the third-from-last
            # entry of the "-"-joined sequence.
            dl = df.loc[is_resn & (df["#ResIndex"] == 4), [attr, "sequence"]]
            cl = np.array([AMINO_ACID_COLORS[seq.split("-")[-3]] for seq in dl["sequence"]])
            ldata.append((resn, (dl[attr]).to_numpy(), cl))
            # Right panel: rows at #ResIndex i-1 (i = peptide length), coloured
            # by the third entry of the sequence.
            dr = df.loc[is_resn & (df["#ResIndex"] == i-1), [attr, "sequence"]]
            cr = np.array([AMINO_ACID_COLORS[seq.split("-")[2]] for seq in dr["sequence"]])
            rdata.append((resn, (dr[attr]).to_numpy(), cr))

        amino_beeswarm(ldata, ax=axs[0], title=f"{ylabel}", ylabel=ylabel, edgecolors="w", linewidth=.1, alpha=1, s=8)
        amino_beeswarm(rdata, ax=axs[1], title=f"{ylabel}", ylabel=None, edgecolors="w", linewidth=.1, alpha=1, s=8)
    
    fig.savefig(f"figures/{attr}_300dpi_win3.pdf", dpi=300, bbox_inches="tight")

In [None]:
attr = "ConStaVa"
# Title and y label use the raw column name; substitute
# "Conf. State Variability" here for a descriptive label.
ylabel = attr

# Seeded generator so the shuffled point order, and therefore the figure, is
# identical on every run.
SHUFFLE_SEED = 0
rng = np.random.default_rng(SHUFFLE_SEED)

fig, ax = plt.subplots(1, figsize=(6, 2), dpi=96, sharey=True, gridspec_kw={"left":0, "right":1, "bottom":0, "top":1})

# Frames for peptide lengths 6-9 are overlaid on the same axes.
for i, df in enumerate([df6, df7, df8, df9], start=6):
    ldata = []
    for resn in residues:
        is_resn = df["ResName"] == resn
        # Pool the rows at #ResIndex 4 and at #ResIndex i-1 (i = peptide length).
        dl = pd.concat([
            df.loc[is_resn & (df["#ResIndex"] == 4), [attr, "sequence"]],
            df.loc[is_resn & (df["#ResIndex"] == i-1), [attr, "sequence"]]
        ])
        # NOTE(review): both groups are coloured by sequence entry [-3]; the
        # per-attribute cell colours the #ResIndex == i-1 group by entry [2]
        # instead - confirm which is intended here.
        cl = np.array([AMINO_ACID_COLORS[seq.split("-")[-3]] for seq in dl["sequence"]])
        # Shuffle values and colours together so that the horizontal position
        # within a bin does not depend on the row order of the table.
        order = rng.permutation(cl.shape[0])
        ldata.append((resn, (dl[attr]).to_numpy()[order], cl[order]))

    amino_beeswarm(ldata, ax=ax, title=f"{ylabel}", ylabel=ylabel, edgecolors="w", linewidth=.1, alpha=1, s=8)

# savefig does not create missing directories.
os.makedirs("figures/dynamics", exist_ok=True)
fig.savefig(f"figures/dynamics/{attr}_600dpi_win3.png", dpi=600, bbox_inches="tight")