In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# --- Paths (relative to the notebook location) ---
# Output: directory that receives one PNG per protein.
figures_dir = "../data/output/figures"

# Input: merged per-residue table, tab-separated.
dataset_merge_file = "../data/output/dataset_merge.tsv"

In [3]:
# Read the merged dataset; read_table uses a tab separator by default,
# which matches the .tsv input.
df_dataset = pd.read_table(dataset_merge_file)
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,af-binding,af-disorder,af-rsa,pdb,disorder-mean,binding-mean
0,DP02342,P06837,1,M,1.0,,,,,,0.887,0.270,0.897,,0.815789,0.888889
1,DP02342,P06837,2,L,1.0,,,,,,0.889,0.266,0.891,,0.763158,0.814815
2,DP02342,P06837,3,C,1.0,,,,,,0.893,0.256,0.885,,0.736842,0.814815
3,DP02342,P06837,4,C,1.0,,,,,,0.876,0.296,0.878,,0.789474,0.851852
4,DP02342,P06837,5,M,1.0,,,,,,0.878,0.292,0.873,,0.815789,0.888889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,Q96ST2,815,N,,,,,,,,,,,,
297692,DP03758,Q96ST2,816,K,,,,,,,,,,,,
297693,DP03758,Q96ST2,817,M,,,,,,,,,,,,
297694,DP03758,Q96ST2,818,P,,,,,,,,,,,,


In [4]:
# Prepare the columns for plotting.
# 'af-plddt' is the complement of the 'af-disorder' score (1 - af-disorder).
df_dataset['af-plddt'] = df_dataset['af-disorder'].rsub(1)

# Lift each reference annotation (value 1) to its own level above 1 so the
# three tracks are offset from each other in the figures.
for column, level in (('disorder', 1.3), ('binding', 1.2), ('pdb', 1.1)):
    is_annotated = df_dataset[column] == 1
    df_dataset.loc[is_annotated, column] = level
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,af-binding,af-disorder,af-rsa,pdb,disorder-mean,binding-mean,af-plddt
0,DP02342,P06837,1,M,1.3,,,,,,0.887,0.270,0.897,,0.815789,0.888889,0.730
1,DP02342,P06837,2,L,1.3,,,,,,0.889,0.266,0.891,,0.763158,0.814815,0.734
2,DP02342,P06837,3,C,1.3,,,,,,0.893,0.256,0.885,,0.736842,0.814815,0.744
3,DP02342,P06837,4,C,1.3,,,,,,0.876,0.296,0.878,,0.789474,0.851852,0.704
4,DP02342,P06837,5,M,1.3,,,,,,0.878,0.292,0.873,,0.815789,0.888889,0.708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,Q96ST2,815,N,,,,,,,,,,,,,
297692,DP03758,Q96ST2,816,K,,,,,,,,,,,,,
297693,DP03758,Q96ST2,817,M,,,,,,,,,,,,,
297694,DP03758,Q96ST2,818,P,,,,,,,,,,,,,


In [8]:
# Generate all the figures: one PNG per (disprot_id, acc) group, with the
# residue position on the x axis and one line per plotted column.

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(figures_dir, exist_ok=True)

# Plotted columns, in drawing order. The style/width lists below are matched
# to this order: thick solid lines for the three reference tracks, thin dashed
# lines for the two "-mean" columns, thin solid line for 'af-plddt'.
plot_columns = ['disorder', 'binding', 'pdb', 'binding-mean', 'disorder-mean', 'af-plddt']
style = ['-', '-', '-', '--', '--', '-']
lw = [6, 6, 6, 2, 2, 2]

for name, df_g in df_dataset.groupby(['disprot_id', 'acc']):
    df_g = df_g.loc[:, ['pos'] + plot_columns]
    # The y range goes up to 1.4 to leave room for the reference tracks,
    # which are set to 1.1 / 1.2 / 1.3 earlier in the notebook.
    ax = df_g.plot(x='pos', ylim=(0,1.4), grid=True, figsize=(30,8), fontsize=18)
    ax.set_title(" - ".join(name), fontdict={'fontsize':18})
    ax.xaxis.label.set_visible(False)
    # zip instead of positional indexing: never raises IndexError if the
    # number of drawn lines and the number of styles differ.
    for line, line_style, line_width in zip(ax.lines, style, lw):
        plt.setp(line, linestyle=line_style, linewidth=line_width)
    # Legend outside the axes, on the right-hand side.
    ax.legend(prop={'size': 18}, loc='center left', bbox_to_anchor=(1.0, 0.5))
    fig = ax.get_figure()
    fig.savefig("{}/{}.png".format(figures_dir, "_".join(name)), bbox_inches='tight')
    # Close each figure to avoid accumulating open figures across the loop.
    plt.close(fig)

In [6]:
# NOTE: the reference columns hold 1.3 / 1.2 / 1.1 (not 1) after the offset
# step earlier in the notebook, hence the ">= 1" comparisons below.

# DisProt disorder, AF order, predictors disorder
# Kept under its own name: previously this result was assigned to `accs` and
# immediately overwritten by the next selection, so it was never usable.
accs_af_ordered = df_dataset[(df_dataset['disorder'] >= 1) & (df_dataset['af-plddt'] > 0.7) & (df_dataset['disorder-mean'] >= 0.9)]['acc'].unique()

# Accessions with at least one residue flagged in all three reference
# columns at once: disorder, pdb and binding.
accs = df_dataset[(df_dataset['disorder'] >= 1) & (df_dataset['pdb'] >= 1.0) & (df_dataset['binding'] >= 1.0)]['acc'].unique()

# All rows (every residue) of the accessions selected in `accs`.
df_ = df_dataset.loc[df_dataset['acc'].isin(accs)]

In [7]:
# Distinct accessions present in the filtered subset, in order of appearance.
df_.loc[:, 'acc'].unique()

array(['P06837', 'Q8N5F7', 'P30130', 'P16070', 'P09077', 'P25298',
       'P18887', 'P46379', 'O74986', 'O43521', 'P40381', 'Q8NBI3',
       'Q9UIF9', 'Q9BYG3', 'Q7T2G3', 'Q9WIK7', 'Q9QR71', 'Q8JUX6',
       'Q9JZ38', 'P55127', 'Q9JZN9', 'Q9JZ10', 'Q9QR99', 'P81605',
       'D3TI84', 'Q4LE39', 'Q13480', 'O96018', 'A7KHA7', 'P24588',
       'P32597', 'Q96ST2'], dtype=object)