## Merge reference data with prediction agreement

In [1]:
import os
import pandas as pd
import numpy as np
import requests

In [2]:
dataset_file = "../data/output/dataset.tsv"
prediction_dir = "../data/predictions"  # scp -r urano:/projects/CAID2/predictions .
predictions_references_file = "../data/predictions_references.tsv"

assessment_disorder_file = "../data/disorder.analysis.all.dataset.f1s.metrics.csv"
assessment_binding_file = "../data/binding.analysis.all.dataset.f1s.metrics.csv"

alphafold_dir = "../data/alphafold"

# Output
dataset_merge_file = "../data/output/dataset_merge.tsv"

In [3]:
# It assumes one method participate in only one challenge
df_challenge = pd.read_csv(predictions_references_file, sep="\t")
df_challenge
# df_challenge[df_challenge['Method'] == 'ENSHROUD-all']

Unnamed: 0,Method,disorder,binding
0,AIUPred,1,0
1,AlphaFold-dis,1,0
2,AlphaFold-dis25,1,0
3,ANCHOR2,0,1
4,APOD,1,0
...,...,...,...
64,SETH-1,1,0
65,SPOT-Disorder,1,0
66,SPOT-Disorder-Single,1,0
67,SPOT-Disorder2,1,0


In [27]:
df_1 = pd.read_csv(assessment_disorder_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_1['challenge'] = 'disorder'

df_2 = pd.read_csv(assessment_binding_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_2['challenge'] = 'binding'
df_2

df_ = pd.concat([df_1, df_2])

df_thresholds = df_.loc[df_['method'].isin(df_challenge[df_challenge['disorder'] == 1]['Method']) & (df_['challenge'] == 'disorder')]
df_thresholds = pd.concat([df_thresholds, df_.loc[df_['method'].isin(df_challenge[df_challenge['binding'] == 1]['Method']) & (df_['challenge'] == 'binding')]])
df_thresholds['thr'].astype('float32')
df_thresholds

Unnamed: 0,method,thr,challenge
0,ESpritz-D,0.414,disorder
1,ESpritz-X,0.102,disorder
2,ESpritz-N,0.453,disorder
3,DisEMBL-dis465,0.448,disorder
4,DisEMBL-disHL,0.107,disorder
...,...,...,...
22,OPAL,0.535,binding
23,DISOPRED3-bind,0.000,binding
24,ProBiPred-protein,0.002,binding
25,ProBiPred-nucleic,0.216,binding


In [5]:
df_list = []
for pred_file in os.listdir(prediction_dir):
    pred_name = pred_file[:-5]  
    if pred_name in df_thresholds['method'].values:
#         print(pred_name)
        data = []
        with open("{}/{}".format(prediction_dir, pred_file)) as f:
            for line in f:
                if line[0] == ">":
                    disprot_id = line[1:].strip()
                else:
                    data.append([pred_name, disprot_id] + line[:-1].split("\t"))     
        df_ = pd.DataFrame(data, columns=['method', 'disprot_id', 'pos', 'aa', 'score', 'class'])
#         print(df_)
        if df_['score'].values[0] == "":
            df_['class'] = df_['class'].astype(int)
        else:
            if (df_['score'] == "").any():
                print("Empty score", pred_name)
                df_.loc[df_['score'] == "", 'score'] = np.nan
            df_['score'] = df_['score'].astype('float32')
            df_['class'] = 0
            df_.loc[df_['score'] >= df_thresholds.loc[df_thresholds['method'] == pred_name, 'thr'].iloc[0], 'class'] = 1
        df_.drop(columns=['aa', 'score'], inplace=True)
        df_list.append(df_)
    else:
        print("missing method threshold", pred_name)

df_pred = pd.concat(df_list)    
df_pred

missing method threshold ENSHROUD-nucleic
Empty score DisoBindPred
missing method threshold DeepDISOBind-nucleic
missing method threshold DeepDISOBind-protein
missing method threshold ENSHROUD-protein
missing method threshold AIUPred


Unnamed: 0,method,disprot_id,pos,class
0,flDPtr,DP02342,1,1
1,flDPtr,DP02342,2,1
2,flDPtr,DP02342,3,1
3,flDPtr,DP02342,4,1
4,flDPtr,DP02342,5,1
...,...,...,...,...
301237,DFLpred,DP03746,1280,1
301238,DFLpred,DP03746,1281,1
301239,DFLpred,DP03746,1282,1
301240,DFLpred,DP03746,1283,1


In [6]:
# Fix bug in SETH-O position column
df_pred['pos'] = df_pred['pos'].astype(int)
df_pred.loc[df_pred['method'] == 'SETH-0', ['pos']] += 1

In [7]:
df_pred = pd.merge(df_pred, df_challenge, left_on=['method'], right_on=['Method'], how='left').drop(columns='Method')
df_pred

Unnamed: 0,method,disprot_id,pos,class,disorder,binding
0,flDPtr,DP02342,1,1,1,0
1,flDPtr,DP02342,2,1,1,0
2,flDPtr,DP02342,3,1,1,0
3,flDPtr,DP02342,4,1,1,0
4,flDPtr,DP02342,5,1,1,0
...,...,...,...,...,...,...
18581534,DFLpred,DP03746,1280,1,0,1
18581535,DFLpred,DP03746,1281,1,0,1
18581536,DFLpred,DP03746,1282,1,0,1
18581537,DFLpred,DP03746,1283,1,0,1


In [8]:
df_pred_binding = pd.pivot_table(df_pred.loc[df_pred['binding'] == 1], index=['disprot_id', 'pos'], columns=['method'], values=['class'])
df_pred_disorder = pd.pivot_table(df_pred.loc[df_pred['disorder'] == 1], index=['disprot_id', 'pos'], columns=['method'], values=['class'])

In [9]:
df_pred_binding.columns = df_pred_binding.columns.set_levels(['binding'], level=0)
df_pred_disorder.columns = df_pred_disorder.columns.set_levels(['disorder'], level=0)
df_pred = pd.merge(df_pred_disorder, df_pred_binding, on=['disprot_id', 'pos'])
df_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,method,APOD,AUCpred-no-profile,AUCpred-profile,AlphaFold-dis,AlphaFold-dis25,DISOPRED3-diso,DeepIDP-2L,DisEMBL-dis465,DisEMBL-disHL,DisoPred,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
disprot_id,pos,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
DP02342,1,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,5,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DP03746,1280,0.0,1.0,1.0,,,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
DP03746,1281,0.0,1.0,1.0,,,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
DP03746,1282,0.0,1.0,1.0,,,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
DP03746,1283,0.0,1.0,1.0,,,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
df_dataset = pd.read_csv(dataset_file, sep="\t")
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,pdb
0,DP02342,P06837,1,M,1.0,,,,,,
1,DP02342,P06837,2,L,1.0,,,,,,
2,DP02342,P06837,3,C,1.0,,,,,,
3,DP02342,P06837,4,C,1.0,,,,,,
4,DP02342,P06837,5,M,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
378005,DP03896,Q13635,1443,G,1.0,,,,,,
378006,DP03896,Q13635,1444,S,1.0,,,,,,
378007,DP03896,Q13635,1445,S,1.0,,,,,,
378008,DP03896,Q13635,1446,S,1.0,,,,,,


In [11]:
# df_dataset.columns.set_levels(['reference'], level=0)
df_dataset.columns = pd.MultiIndex.from_product([['reference'], df_dataset.columns])
df_dataset

Unnamed: 0_level_0,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference
Unnamed: 0_level_1,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,pdb
0,DP02342,P06837,1,M,1.0,,,,,,
1,DP02342,P06837,2,L,1.0,,,,,,
2,DP02342,P06837,3,C,1.0,,,,,,
3,DP02342,P06837,4,C,1.0,,,,,,
4,DP02342,P06837,5,M,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
378005,DP03896,Q13635,1443,G,1.0,,,,,,
378006,DP03896,Q13635,1444,S,1.0,,,,,,
378007,DP03896,Q13635,1445,S,1.0,,,,,,
378008,DP03896,Q13635,1446,S,1.0,,,,,,


## AlphaFold

Generate AlphaFold predictions, see "alphafold" notebook

In [13]:
df_af = pd.read_csv("{}/af_pred.tsv".format(alphafold_dir), sep="\t")
df_af = df_af.rename(columns={"name": "acc", "disorder": "af-disorder", "disorder-25": "af-rsa", "binding-25-0.581": "af-binding"})
df_af = df_af[["acc", "pos", "aa", "af-disorder", "af-rsa", "af-binding"]]
df_af = df_af.set_index(['acc', 'pos', 'aa'])
df_af.columns = pd.MultiIndex.from_product([['alphafold'], df_af.columns])
df_af

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,alphafold,alphafold,alphafold
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,af-disorder,af-rsa,af-binding
acc,pos,aa,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A0A090N8E9,1,M,0.672,0.792,0.718
A0A090N8E9,2,G,0.642,0.784,0.731
A0A090N8E9,3,Q,0.693,0.756,0.710
A0A090N8E9,4,T,0.646,0.756,0.729
A0A090N8E9,5,G,0.692,0.769,0.710
...,...,...,...,...,...
S8F2K7,519,E,0.663,0.770,0.722
S8F2K7,520,R,0.685,0.780,0.713
S8F2K7,521,Q,0.706,0.792,0.704
S8F2K7,522,H,0.657,0.767,0.725


In [14]:
df_dataset = pd.merge(df_dataset, df_af, left_on=[('reference', 'acc'), ('reference', 'pos'), ('reference', 'aa')], right_on=['acc', 'pos', 'aa'], how='left')
df_dataset

Unnamed: 0_level_0,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,alphafold,alphafold,alphafold
Unnamed: 0_level_1,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,pdb,af-disorder,af-rsa,af-binding
0,DP02342,P06837,1,M,1.0,,,,,,,0.270,0.897,0.887
1,DP02342,P06837,2,L,1.0,,,,,,,0.266,0.891,0.889
2,DP02342,P06837,3,C,1.0,,,,,,,0.256,0.885,0.893
3,DP02342,P06837,4,C,1.0,,,,,,,0.296,0.878,0.876
4,DP02342,P06837,5,M,1.0,,,,,,,0.292,0.873,0.878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378005,DP03896,Q13635,1443,G,1.0,,,,,,,0.618,0.907,0.741
378006,DP03896,Q13635,1444,S,1.0,,,,,,,0.648,0.911,0.729
378007,DP03896,Q13635,1445,S,1.0,,,,,,,0.676,0.910,0.717
378008,DP03896,Q13635,1446,S,1.0,,,,,,,0.636,0.918,0.733


In [15]:
df_dataset = pd.merge(df_dataset, df_pred, left_on=[('reference', 'disprot_id'), ('reference', 'pos')], right_on=['disprot_id', 'pos'], how='left')
df_dataset

Unnamed: 0_level_0,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
0,DP02342,P06837,1,M,1.0,,,,,,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,DP02342,P06837,2,L,1.0,,,,,,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,DP02342,P06837,3,C,1.0,,,,,,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,DP02342,P06837,4,C,1.0,,,,,,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,DP02342,P06837,5,M,1.0,,,,,,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378005,DP03896,Q13635,1443,G,1.0,,,,,,...,,,,,,,,,,
378006,DP03896,Q13635,1444,S,1.0,,,,,,...,,,,,,,,,,
378007,DP03896,Q13635,1445,S,1.0,,,,,,...,,,,,,,,,,
378008,DP03896,Q13635,1446,S,1.0,,,,,,...,,,,,,,,,,


In [16]:
df_dataset.set_index([('reference', 'disprot_id'), ('reference', 'acc'), ('reference', 'pos'), ('reference', 'aa')], append=True, inplace=True)
df_dataset.index.names = [None, 'disprot_id', 'acc', 'pos', 'aa']
df_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,reference,reference,reference,reference,reference,reference,reference,alphafold,alphafold,alphafold,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,disorder,linker,transition,nucleic acid binding,binding,protein binding,pdb,af-disorder,af-rsa,af-binding,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
Unnamed: 0_level_2,disprot_id,acc,pos,aa,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
0,DP02342,P06837,1,M,1.0,,,,,,,0.270,0.897,0.887,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,DP02342,P06837,2,L,1.0,,,,,,,0.266,0.891,0.889,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,DP02342,P06837,3,C,1.0,,,,,,,0.256,0.885,0.893,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,DP02342,P06837,4,C,1.0,,,,,,,0.296,0.878,0.876,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,DP02342,P06837,5,M,1.0,,,,,,,0.292,0.873,0.878,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378005,DP03896,Q13635,1443,G,1.0,,,,,,,0.618,0.907,0.741,...,,,,,,,,,,
378006,DP03896,Q13635,1444,S,1.0,,,,,,,0.648,0.911,0.729,...,,,,,,,,,,
378007,DP03896,Q13635,1445,S,1.0,,,,,,,0.676,0.910,0.717,...,,,,,,,,,,
378008,DP03896,Q13635,1446,S,1.0,,,,,,,0.636,0.918,0.733,...,,,,,,,,,,


In [None]:
df_dataset.to_csv(dataset_merge_file, sep="\t")