In [None]:
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# Input
dataset_raw_file = "../data/output/dataset_raw.tsv"
dataset_file = "../data/output/dataset.tsv"
datase_file = dataset_file  # original (misspelled) name, kept so existing references still resolve
references_dir = "../data/output/references"
ncbi_tax_file = "../data/new_taxdump/rankedlineage.dmp"

# Output
fig_dir = "../data/output/references_stat"
# Every figure below is saved into fig_dir; create it up front so savefig
# does not fail when the directory does not exist yet.
os.makedirs(fig_dir, exist_ok=True)

In [None]:
# Read the raw, tab-separated dataset table and list its column names.
df_dataset = pd.read_csv(dataset_raw_file, delimiter="\t")
df_dataset.columns

In [None]:
# Display the raw dataset table loaded above.
df_dataset

In [None]:
def _parse_reference_lines(lines):
    """Parse FASTA-like reference lines into ``{name: [list_of_chars, ...]}``.

    A line starting with ">" opens a record named by the rest of that line;
    every following non-empty line is stored as a list of its characters.
    Blank lines are skipped: storing them as empty rows would make
    ``zip(*rows)`` yield nothing and silently drop all residues of the record.

    Raises:
        ValueError: if a data line appears before any ">" header.
    """
    records = {}
    current = None
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            current = line[1:]
            records.setdefault(current, [])
        elif current is None:
            raise ValueError("data line found before the first '>' header: {!r}".format(line))
        else:
            records[current].append(list(line))
    return records


tuples = []
# Sorted so that the row order does not depend on how the OS lists the directory.
for reference_file in sorted(os.listdir(references_dir)):
    reference_path = os.path.join(references_dir, reference_file)
    if not os.path.isfile(reference_path):
        continue  # a sub-directory cannot be opened as a reference file
    reference_name = reference_file.split(".")[0]
    # The first keyword contained in the file name decides the group (order matters);
    # the group stays None when no keyword matches.
    group = next((k for k in ('linker', 'binding', 'disorder') if k in reference_name), None)
    with open(reference_path) as f:
        data = _parse_reference_lines(f)
    for name, rows in data.items():
        # The (aa, val) unpacking requires exactly two rows per record; zip pairs
        # them position by position and positions are numbered from 1.
        for pos, (aa, val) in enumerate(zip(*rows), start=1):
            tuples.append((group, reference_name, name, pos, aa, val))

# One row per residue, indexed by everything except the label itself.
df_references = pd.DataFrame(
    tuples, columns=['group', 'reference', 'disprot_id', 'pos', 'aa', 'value']
).set_index(['group', 'reference', 'disprot_id', 'pos', 'aa'])
df_references

# Number of proteins

In [None]:
plt.rcParams.update({'font.size': 22, 'grid.linestyle': '-', 'grid.color': 'gray' })

# For each group, print how many distinct proteins every reference contains.
for group, df_g in df_references.groupby(level='group'):
    proteins_by_reference = df_g.reset_index(level='disprot_id').groupby(['reference'])['disprot_id']
    df2 = proteins_by_reference.nunique().to_frame()
    print(df2)

    # Optional bar chart of the same numbers (disabled):
    # fig, ax = plt.subplots(figsize=(5, 5))
    # df2.plot(kind='barh', width=0.5, ax=ax)
    # ax.xaxis.grid()
    # ax.set_ybound(-1, 4)
    # ax.get_legend().remove()
    # ax.set_xlabel('Proteins', labelpad=30)
    #
    # fig.gca().set_ylabel("")
    # fig.savefig("{}/stat_proteins_{}.png".format(fig_dir, group), dpi=300, bbox_inches='tight')

# Class content

In [None]:
# Paper figure: number of negative ('0') and positive ('1') residues per reference.
plt.rcParams.update({'font.size': 42, 'savefig.transparent': True})
challenges = ['disorder_nox', 'disorder_pdb', 'binding', 'linker']
colors = {'0': 'white', '1': 'black'}

challenge_rank = {challenge: i for i, challenge in enumerate(challenges)}
df2 = df_references.reset_index()
df2 = (
    df2.loc[df2['reference'].isin(challenges), ['reference', 'value']]
    .value_counts()
    .unstack('value')
    .fillna(0)
    # the '-' class is not plotted; tolerate data in which it never occurs
    .drop(columns=['-'], errors='ignore')
    # descending rank: barh draws the first row at the bottom, so the chart
    # reads in the order of `challenges` from top to bottom
    .sort_index(level='reference', key=lambda x: x.map(challenge_rank), ascending=False)
)
df2['pos_ratio'] = df2['1'] * 100 / (df2['1'] + df2['0'])
print(df2)

fig, ax = plt.subplots(figsize=(12, 6))
# Plot only the two count columns: pos_ratio is a percentage and must not be
# stacked onto the bars as a third segment.
axes = df2[['0', '1']].plot(kind='barh', stacked=True, ax=ax, edgecolor='black', color=[colors['0'], colors['1']], linewidth=3)
ax.xaxis.grid()
ax.set_xlabel('No. residues', labelpad=30)
ax.xaxis.set_major_formatter( matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
ax.tick_params(axis='x', labelrotation=45)

ax.get_legend().remove()
ax.set_xlim(0, 120000)

# Write the share of positive residues to the right of each bar.
# Patches are stored column by column: the first len(df2) rectangles are the
# '0' segments, the next len(df2) are the '1' segments that end each bar.
positive_bars = axes.patches[len(df2):2 * len(df2)]
for rect, (_, row) in zip(positive_bars, df2.iterrows()):
    ax.text(rect.get_x() + rect.get_width() + 20000, rect.get_y(), "{:.1f}%".format(row['pos_ratio']), ha='center', va='bottom')

ax.set_ylabel("")
fig.savefig("{}/stat_residues_paper.png".format(fig_dir), dpi=300, bbox_inches='tight')

In [None]:
references = ['disorder_nox', 'disorder_pdb', 'binding', 'linker']
protein_keys = ['disprot_id', 'reference']

# Residues of the selected references, indexed by reference name.
df_g = df_references.reset_index().set_index('reference').loc[references]

# Residue count per protein, reference and class, leaving out the '-' class.
df_ = df_g.groupby(protein_keys + ['value']).agg({'pos': 'count'}).reset_index()
df_ = df_[df_['value'].ne('-')]

# Total of the remaining residues per protein and reference.
df_tot = df_.groupby(protein_keys).agg({'pos': 'sum'})

# After the merge: pos_x is the class count, pos_y the per-protein total.
df_ = df_.merge(df_tot, on=protein_keys)
df_['fraction'] = df_['pos_x'].div(df_['pos_y'])
df_


In [None]:
# Paper figure: distribution of the per-protein class content for each reference
# (top panel: number of residues, bottom panel: share of the protein).
plt.rcParams.update({'font.size': 42, 'savefig.transparent': True})

# One y slot per (reference, value) box, listed in the sorted order in which the
# groups are produced.
# NOTE(review): assumes df_ holds exactly 8 groups (4 references x values '0'/'1') - confirm.
positions = [2,3,6,7,4,5,0,1]
# colors = {'0': '#1f77b4', '1': '#ff7f0e'}
colors = {'0': 'white', '1': 'black'}

# Styling shared by both panels.
box_style = dict(vert=False, positions=positions, showfliers=False, return_type='dict', patch_artist=True, whiskerprops=dict(linestyle='-',linewidth=3.0, color='black'), medianprops=dict(linewidth=3.0, color='red'), boxprops=dict(linewidth=3.0, color='black'), capprops=dict(linewidth=3.0))

fig, axes = plt.subplots(2,1, figsize=(20, 30))

bp = df_.loc[:, ['reference', 'value', 'pos_x']].boxplot(by=['reference', 'value'], ax=axes[0], **box_style)
bp_fraction = df_.loc[:, ['reference', 'value', 'fraction']].boxplot(by=['reference', 'value'], ax=axes[1], **box_style)

for ax in axes:
    # Tick text looks like "(reference, value)"; keep only the reference name.
    labels = [item.get_text()[1:-1].split(",")[0] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.set_ylabel("")
    ax.get_figure().suptitle('')
    ax.set_title("")

axes[1].set_xlabel('Protein content (%)', labelpad=30)
# 'fraction' is a 0-1 ratio: print the ticks as percentages so that they agree
# with the "(%)" axis label.
axes[1].xaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(xmax=1, symbol=''))
axes[0].set_xlabel('Protein content (no. residues)', labelpad=30)

# Box colors: boxes come in the same order as the (reference, value) groups,
# and the second element of each group key is the class ('0' or '1').
group_labels = [label for label, _ in df_.groupby(['reference', 'value'])]
for boxes in (bp_fraction['fraction']['boxes'], bp['pos_x']['boxes']):
    for patch, label in zip(boxes, group_labels):
        patch.set_facecolor(colors[label[1]])

# Legend handles, only needed if the legend below is re-enabled.
positive = mpatches.Patch(label='Positive', color=colors['1'])
negative = mpatches.Patch(label='Negative', color=colors['0'])

# Legend
# handles, labels = ax.get_legend_handles_labels()
# fig.legend(handles=[positive, negative], loc='upper center', ncol=2, bbox_to_anchor=(.6, 1.0), frameon=False)
fig.tight_layout(pad=3.0)
fig.savefig("{}/stat_content_paper.png".format(fig_dir), dpi=300, bbox_inches='tight')

In [None]:
# WARNING: just for comparison, to check how the content changes when considering the same set of proteins
# Disorder content disorder-nox vs disorder-pdb
plt.rcParams.update({'font.size': 42, 'savefig.transparent': True})

# One y slot per (reference, value) box, listed in the sorted order in which the
# groups are produced.
# NOTE(review): assumes the filtered table still holds exactly 8 groups - confirm.
positions = [2,3,6,7,4,5,0,1]
# colors = {'0': '#1f77b4', '1': '#ff7f0e'}
colors = {'0': 'white', '1': 'black'}

# Styling shared by both panels.
box_style = dict(vert=False, positions=positions, showfliers=False, return_type='dict', patch_artist=True, whiskerprops=dict(linestyle='-',linewidth=3.0, color='black'), medianprops=dict(linewidth=3.0, color='red'), boxprops=dict(linewidth=3.0, color='black'), capprops=dict(linewidth=3.0))

fig, axes = plt.subplots(2,1, figsize=(20, 30))

# Keep, for every reference, only the proteins that also occur in disorder_nox.
protein_nox = df_.loc[df_['reference']=='disorder_nox']['disprot_id'].unique()
df_nox = df_.loc[df_['disprot_id'].isin(protein_nox)]

bp = df_nox.loc[:, ['reference', 'value', 'pos_x']].boxplot(by=['reference', 'value'], ax=axes[0], **box_style)
bp_fraction = df_nox.loc[:, ['reference', 'value', 'fraction']].boxplot(by=['reference', 'value'], ax=axes[1], **box_style)

for ax in axes:
    # Tick text looks like "(reference, value)"; keep only the reference name.
    labels = [item.get_text()[1:-1].split(",")[0] for item in ax.get_yticklabels()]
    ax.set_yticklabels(labels)
    ax.set_ylabel("")
    ax.get_figure().suptitle('')
    ax.set_title("")

axes[1].set_xlabel('Protein content (%)', labelpad=30)
# 'fraction' is a 0-1 ratio: print the ticks as percentages so that they agree
# with the "(%)" axis label.
axes[1].xaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(xmax=1, symbol=''))
axes[0].set_xlabel('Protein content (no. residues)', labelpad=30)

# Box colors: take the group keys from the same filtered table that was plotted,
# so that boxes and keys cannot get out of step.
group_labels = [label for label, _ in df_nox.groupby(['reference', 'value'])]
for boxes in (bp_fraction['fraction']['boxes'], bp['pos_x']['boxes']):
    for patch, label in zip(boxes, group_labels):
        patch.set_facecolor(colors[label[1]])

# Legend handles, only needed if the legend below is re-enabled.
positive = mpatches.Patch(label='Positive', color=colors['1'])
negative = mpatches.Patch(label='Negative', color=colors['0'])

# Legend
# handles, labels = ax.get_legend_handles_labels()
# fig.legend(handles=[positive, negative], loc='upper center', ncol=2, bbox_to_anchor=(.6, 1.0), frameon=False)
fig.tight_layout(pad=3.0)
fig.savefig("{}/stat_content_paper_only_nox.png".format(fig_dir), dpi=300, bbox_inches='tight')


# Taxonomy

In [None]:
# ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump
# rankedlineage.dmp has no header row, hence header=None: with header=0 the first
# record would be consumed as a header and replaced by `names`.
df_tax = pd.read_csv(ncbi_tax_file, sep="|", header=None, names=['id', '0', '1', '2', '3', '4', '5'], usecols=[0,3,4,5,6,7,9])
for col in df_tax.columns:
    if df_tax[col].dtype == 'object':
        # Fields are padded with tabs around the "|" separator; unlike
        # map(str.strip), .str.strip() also tolerates missing values.
        df_tax[col] = df_tax[col].str.strip()
df_tax

In [None]:
# Attach the NCBI taxon id of every protein, then the lineage of that taxon.
# df_dataset presumably holds several rows per protein (the start/end columns used
# further down suggest one row per annotated region - confirm). Without
# de-duplicating, every residue row would be repeated once per dataset row of
# its protein.
protein_taxon = df_dataset.loc[:, ['disprot_id', 'ncbi_taxon_id']].drop_duplicates()
df_references_tax = pd.merge(df_references.reset_index(), protein_taxon, on='disprot_id', how='left')
df_references_tax = pd.merge(df_references_tax, df_tax, left_on='ncbi_taxon_id', right_on='id', how='left')
df_references_tax

In [None]:
challenges = ['disorder_nox', 'disorder_pdb', 'binding']

# Number of proteins per value of taxonomy column '5', one column per reference.
df_ = (
    df_references_tax
    .loc[df_references_tax['reference'].isin(challenges), ['reference', 'disprot_id', '5']]
    .drop_duplicates()
    .groupby(['reference', '5'])
    .count()
    .reset_index()
    .pivot(index='5', columns='reference', values='disprot_id')
)
df_

In [None]:
plt.rcParams.update({'font.size': 22, 'xtick.labelsize': 22, 'ytick.labelsize': 22})

# One pie is drawn per column, in column order. df_ comes from a pivot, whose
# columns are sorted by name and therefore not in the order of `challenges`:
# select the columns explicitly and take the titles from the plotted frame so
# that a title can never end up on another reference's pie.
df_pie = df_[challenges]
# explode needs one offset per wedge, i.e. per row of the plotted frame.
axes = df_pie.plot.pie(subplots=True, figsize=(20, 10), explode=[0.05] * len(df_pie), legend=False, layout=(1,3), autopct='%1.1f%%', startangle=30)
for ax, title in zip(axes.reshape(-1), df_pie.columns):
    ax.set_title(title, pad=10, loc='center')
    ax.set_ylabel('')

plt.savefig("{}/stat_tax.png".format(fig_dir), dpi=300, bbox_inches='tight')

# Dataset size

In [None]:
# https://towardsdatascience.com/professional-venn-diagrams-in-python-638abfff39cc
plt.rcParams.update({'font.size': 22, 'grid.linestyle': '-', 'grid.color': 'gray' })

from matplotlib_venn import venn3, venn3_circles

colors = ['#2077B5', '#FF7F0D', '#D72828']
labels = ['disorder_pdb', 'disorder_nox', 'binding']

# One set of protein identifiers per reference.
df_ = df_references.reset_index()
sets = []
for label in labels:
    sets.append(set(df_.loc[df_['reference'] == label, 'disprot_id']))

plt.figure(figsize=(7, 7))
v = venn3(subsets=sets, set_labels=labels, alpha=.0)

# Move the region labels: each region id is paired with (x, y) scale factors
# applied to the label's current position.
labels = ['100', '101', '110', '010', '001', '011', '111']
positions = [(1.2, 1), (.9, .85), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]
for label, (scale_x, scale_y) in zip(labels, positions):
    lbl = v.get_label_by_id(label)
    if not lbl:
        continue
    x, y = lbl.get_position()
    # print(label, x, y)
    lbl.set_position((x * scale_x, y * scale_y))

# Move and color the set names in the same way.
set_label_scales = [(1.3, 0.8), (1.2, 1), (3.5, .7)]
for text, (scale_x, scale_y), color in zip(v.set_labels, set_label_scales, colors):
    x, y = text.get_position()
    print(text, x, y)
    text.set_position((x * scale_x, y * scale_y))
    text.set_color(color)

# Draw the colored outlines on top of the fully transparent patches.
circles = venn3_circles(subsets=sets)
for circle, color in zip(circles, colors):
    circle.set_edgecolor(color)
    circle.set_linewidth(6)

plt.savefig("{}/stat_venn.png".format(fig_dir), dpi=300, bbox_inches='tight')

# Evidence distribution

In [None]:
# Display the dataset table again before expanding its regions into residues.
df_dataset

In [None]:
def expand_region(df_: pd.Series, start_col: str = 'start', end_col: str = 'end', res_col: str = 'reg_position') -> pd.Series:
    """Return a copy of a region row with the list of positions it covers.

    Meant to be used with ``DataFrame.apply(..., axis=1)``, where ``df_`` is
    one row (a Series).

    Args:
        df_: row holding the region boundaries.
        start_col: label of the first position of the region (inclusive).
        end_col: label of the last position of the region (inclusive).
        res_col: label under which the list of positions is stored.

    Returns:
        A new Series equal to ``df_`` plus ``res_col``, which holds
        ``[start, start + 1, ..., end]`` (empty when ``end < start``).
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by apply.
    expanded = df_.copy()
    expanded[res_col] = list(range(int(df_[start_col]), int(df_[end_col]) + 1))
    return expanded

In [None]:
# Add the list of covered positions to every region row, then unfold that list
# into one row per position.
df_regions = df_dataset.apply(expand_region, axis=1)
df_pos = df_regions.explode("reg_position")
df_pos

In [None]:
# Distinct positions per challenge, protein and evidence name ...
positions_per_protein_ec = df_pos.groupby(['challenge', 'disprot_id', 'ec_name'], as_index=False)['reg_position'].nunique()
# ... summed over the proteins of each challenge and evidence name.
df_1 = positions_per_protein_ec.groupby(['challenge', 'ec_name'], as_index=False)[['ec_name', 'reg_position']].sum()
# df_1 = df_pos.groupby(['challenge', 'ec_name'], as_index=False)['reg_position'].count()  # count also residues annotated with the same technique multiple times
df_1

In [None]:
# Distinct positions per challenge and protein, summed over the proteins of each challenge.
positions_per_protein = df_pos.groupby(['challenge', 'disprot_id'], as_index=False)['reg_position'].nunique()
df_2 = positions_per_protein.groupby(['challenge'], as_index=False)['reg_position'].sum()
df_2

In [None]:
# Put the per-evidence count next to the per-challenge total and order the rows
# by challenge, then by ascending count.
df_ec = (
    df_1.merge(df_2, on=['challenge'])
    .rename(columns={'reg_position_x': 'count', 'reg_position_y': 'total'})
    .sort_values(by=['challenge', 'count'], ascending=[True, True])
)
df_ec

In [None]:
# Share of the challenge total that each evidence name accounts for.
df_ec['fraction'] = df_ec['count'].div(df_ec['total'])
df_ec

In [None]:
# One horizontal bar chart per challenge with the coverage of its evidence names.
for challenge, df_g in df_ec.groupby('challenge'):
    print(challenge)
    fig, ax = plt.subplots(figsize=(10, 10))
    # Shorten the evidence names on a copy (assign): writing the column into the
    # group handed out by groupby can raise SettingWithCopyWarning.
    df_g = df_g.assign(ec_name=df_g['ec_name'].map(lambda x: x.split('evidence')[0].split('used')[0]))
    # df_ec is sorted by ascending count within each challenge where it is built,
    # so the last 20 rows are the 20 largest.
    df_g.set_index('ec_name')['fraction'].iloc[-20:].plot(kind='barh', ax=ax)
    ax.xaxis.grid()
    ax.set_xlabel('Coverage', labelpad=30)

    # plt.title(challenge)
    ax.set_ylabel("")
    ax.set_ybound(-1, 20)

    fig.savefig("{}/stat_ec_{}.png".format(fig_dir, challenge), dpi=300, bbox_inches='tight')
    # break

In [None]:
# Challenges shown in the paper figure; the position in this list is used as the panel index.
challenges_list = ['disorder', 'binding']

# Disabled helper that was used to list the evidence names needing a color:
# labels = set()
# for challenge, df_g in df_ec.groupby('challenge'):
#     if challenge in challenges_list:
#         df_g['ec_name'] = df_g['ec_name'].map(lambda x: x.split('evidence')[0].split('used')[0])
#         labels.update(set(df_g.sort_values('fraction', ascending=False).iloc[:20]['ec_name']))
# print(len(labels))
# print("\n".join(labels))

# Hex codes of the tab20 palette, listed in pairs, for reference:
#1f77b4
#aec7e8

#ff7f0e
#ffbb78

#2ca02c
#98df8a

#d62728
#ff9896

#9467bd
#c5b0d5

#8c564b
#c49c94

#e377c2
#f7b6d2

#7f7f7f
#c7c7c7

#bcbd22
#dbdb8d

#17becf
#9edae5

# Bar color for each (shortened, stripped) evidence name. Names listed in the same
# paragraph share one color.
colors = {


"nuclear magnetic resonance":"#1f77b4",
"nuclear magnetic resonance imaging":"#1f77b4",
"nuclear magnetic resonance spectroscopy-based hydrogen-deuterium exchange":"#1f77b4",
"proton-based nuclear magnetic resonance":"#1f77b4",
"heteronuclear single quantum coherence spectroscopy":"#1f77b4",
"nuclear magnetic resonance spectroscopy": "#1f77b4",

"qualitative western immunoblotting":"#ff7f0e",
"multiplex bead-based immunoassay":"#ff7f0e",
"co-immunoprecipitation": "#ff7f0e",
"immunoprecipitation": "#ff7f0e",
"electrophoretic mobility shift assay": "#ff7f0e",
"sodium dodecyl sulfate polyacrylamide gel electrophoresis": "#ff7f0e",
"gel-filtration": "#ff7f0e",
"glutathione S-transferase pull-down assay":"#ff7f0e",
"affinity chromatography":"#ff7f0e",

"microscopy": "#2ca02c",
"electron microscopy":"#2ca02c",
"cryogenic electron microscopy":"#2ca02c",
"cryogenic electron microscopy-based structural model with missing residue coordinates": "#2ca02c",

"isothermal titration calorimetry": "#d62728",
"microscale thermophoresis":"#d62728",

"fluorescence":"#9467bd",
"yellow fluorescent protein fusion protein localization":"#9467bd",
"intrinsic fluorescence-based protein conformation": "#9467bd",

"far-UV circular dichroism":"#8c564b",

"Fourier-transform infrared spectroscopy":"#e377c2",

"author inference":"#7f7f7f",
"combinatorial experimental and author inference":"#7f7f7f",

"X-ray crystallography-based structural model with missing residue coordinates":"#bcbd22",

"small-angle X-ray scattering":"#17becf",
"dynamic light scattering assay":"#17becf",

"yeast 2-hybrid":"#aec7e8",


"in vitro assay":"#ffbb78",

"protein fragment functional complementation":"#98df8a",

"cross-linking": "#ff9896",

"intrinsic disorder prediction": "#c5b0d5",
}

# Paper figure: evidence coverage of the selected challenges, one panel each.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 25))
for challenge, df_g in df_ec.groupby('challenge'):
    if challenge not in challenges_list:
        continue
    i = challenges_list.index(challenge)

    # Shorten the evidence names on a copy (assign): writing the column into the
    # group handed out by groupby can raise SettingWithCopyWarning.
    df_g = df_g.assign(ec_name=df_g['ec_name'].map(lambda x: x.split('evidence')[0].split('used')[0]))
    # df_ec is sorted by ascending count within each challenge where it is built,
    # so the last 20 rows are the 20 largest.
    df_ = df_g.set_index('ec_name')['fraction'].iloc[-20:]
    print(df_.index)

    # Look the colors up by stripped name and report every unmapped name at once
    # instead of failing on the first one with a bare key.
    ec_names = [ec_name.strip() for ec_name in df_.index]
    missing = sorted(set(ec_names) - set(colors))
    if missing:
        raise KeyError("no color assigned to evidence name(s): {}".format(", ".join(missing)))
    colors_ = [colors[ec_name] for ec_name in ec_names]
    df_.plot(kind='barh', ax=axes[i], color=colors_)  # plt.cm.tab20c.colors

    axes[i].xaxis.grid()
    if i > 0:
        # only the panels after the first one get an x label
        axes[i].set_xlabel('Coverage', labelpad=30)

    # plt.title(challenge)
    axes[i].set_ybound(-1, 20)
    axes[i].set_xlim(0,0.35)

    axes[i].set_ylabel("")
fig.savefig("{}/stat_ec_paper.png".format(fig_dir), dpi=300, bbox_inches='tight')


In [None]:
# Display the per-position table built from the dataset regions.
df_pos

In [None]:
# Distinct evidence names recorded for every (challenge, protein, position).
residue_keys = ['challenge', 'disprot_id', 'reg_position']
df_ = df_pos.groupby(residue_keys, as_index=False)['ec_name'].unique()
df_

In [None]:
# Keep the residues that are supported by exactly one evidence name.
df__ = df_.loc[(df_['ec_name'].str.len() == 1)]
# Unwrap that single name; assign() returns a new frame instead of writing into the slice.
df__ = df__.assign(ec_name=df__['ec_name'].str[0])
# Every row is one (challenge, protein, position), so count rows. Counting
# unique position numbers would merge equal positions of different proteins
# and undercount.
df__ = df__.groupby(['challenge', 'ec_name'], as_index=False)['reg_position'].count()
df__

In [None]:
# Single-evidence residue counts of the disorder challenge, largest first.
df__[df__['challenge'].eq('disorder')].sort_values(by='reg_position', ascending=False)

In [None]:
# Total number of single-evidence residues per challenge.
df__.groupby('challenge', as_index=False)['reg_position'].sum()