In [96]:
import os
import pandas as pd
import numpy as np
import requests

In [97]:
dataset_file = "../data/output/dataset.tsv"
prediction_dir = "../data/predictions"
predictions_references_file = "../data/methods_class.tsv"

assessment_disorder_file = "../data/assessment_results/disorder.analysis.all.dataset.f1s.metrics.csv"
assessment_binding_file = "../data/assessment_results/binding.analysis.all.dataset.f1s.metrics.csv"

# Output
dataset_merge_file = "../data/output/dataset_merge.tsv"

In [98]:
# It assumes one method participate in only one challenge
df_challenge = pd.read_csv(predictions_references_file, sep="\t")
df_challenge
# df_challenge[df_challenge['Method'] == 'ENSHROUD-all']

Unnamed: 0,Group,Method,disorder,binding,linker
0,AIUPred,AIUPred,1,0,0
1,AlphaFold-disorder,AlphaFold-binding,0,1,1
2,AlphaFold-disorder,AlphaFold-disorder,1,0,0
3,AlphaFold-disorder,AlphaFold-rsa,1,0,0
4,ANCHOR2,ANCHOR2,0,1,1
...,...,...,...,...,...
66,SETH_1,SETH-1,1,0,0
67,SPOT-Disorder,SPOT-Disorder,1,0,0
68,SPOT-Disorder-Single,SPOT-Disorder-Single,1,0,0
69,SPOT-Disorder2,SPOT-Disorder2,1,0,0


In [99]:
df_1 = pd.read_csv(assessment_disorder_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_1['challenge'] = 'disorder'

df_2 = pd.read_csv(assessment_binding_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_2['challenge'] = 'binding'

df_ = pd.concat([df_1, df_2])

df_thresholds = df_.loc[df_['method'].isin(df_challenge[df_challenge['disorder'] == 1]['Method']) & (df_['challenge'] == 'disorder')]
df_thresholds = pd.concat([df_thresholds, df_.loc[df_['method'].isin(df_challenge[df_challenge['binding'] == 1]['Method']) & (df_['challenge'] == 'binding')]])
df_thresholds['thr'].astype('float32')
df_thresholds

Unnamed: 0,method,thr,challenge
0,AIUPred,0.702,disorder
1,AlphaFold-disorder,0.331,disorder
2,AlphaFold-rsa,0.592,disorder
3,APOD,0.375,disorder
4,AUCpred-no-profile,0.080,disorder
...,...,...,...
26,MoRFchibi-mcl,0.600,binding
27,MoRFchibi-mcw,0.598,binding
28,OPAL,0.533,binding
29,ProBiPred-nucleic,0.216,binding


In [100]:
df_list = []
for pred_file in os.listdir(prediction_dir):
    pred_name = pred_file[:-5]  
    if pred_name in df_thresholds['method'].values:
        # print(pred_name)
        data = []
        with open("{}/{}".format(prediction_dir, pred_file)) as f:
            for line in f:
                if line[0] == ">":
                    disprot_id = line[1:].strip()
                else:
                    data.append([pred_name, disprot_id] + line[:-1].split("\t"))

        if len(data[0]) == 6:
            df_ = pd.DataFrame(data, columns=['method', 'disprot_id', 'pos', 'aa', 'score', 'class'])
        else:
            df_ = pd.DataFrame(data, columns=['method', 'disprot_id', 'pos', 'aa', 'score'])
            df_['class'] = np.nan

        if (df_['class'] == "").any():
            print("Empty class, filling with nan", pred_name)
            df_.loc[df_['class'] == "", 'class'] = np.nan
        df_['class'] = df_['class'].astype('float32')

        if (df_['score'] == "").any():
            print("Empty score, filling with nan", pred_name)
            df_.loc[df_['score'] == "", 'score'] = np.nan
        df_['score'] = df_['score'].astype('float32')

        df_.loc[df_['score'] >= df_thresholds.loc[df_thresholds['method'] == pred_name, 'thr'].iloc[0], 'class'] = 1

        if 'AlphaFold' in pred_name:
            df_['class'] = df_['score']

        df_.drop(columns=['aa', 'score'], inplace=True)
        df_list.append(df_)
    else:
        print("missing method threshold", pred_name)

df_pred = pd.concat(df_list)    
df_pred

Empty score, filling with nan FoldUnfold
Empty score, filling with nan DisoBindPred
Empty class, filling with nan OPAL
Empty class, filling with nan rawMSA
Empty class, filling with nan MoRFchibi-mc
Empty class, filling with nan disomine
Empty class, filling with nan DisEMBL-dis465
Empty class, filling with nan RONN
Empty class, filling with nan DisEMBL-disHL


Unnamed: 0,method,disprot_id,pos,class
0,flDPtr,DP02342,1,1.0
1,flDPtr,DP02342,2,1.0
2,flDPtr,DP02342,3,1.0
3,flDPtr,DP02342,4,1.0
4,flDPtr,DP02342,5,1.0
...,...,...,...,...
301237,DFLpred,DP03746,1280,1.0
301238,DFLpred,DP03746,1281,1.0
301239,DFLpred,DP03746,1282,1.0
301240,DFLpred,DP03746,1283,1.0


In [102]:
df_pred['pos'] = df_pred['pos'].astype(int)
df_pred = pd.merge(df_pred, df_challenge, left_on=['method'], right_on=['Method'], how='left').drop(columns='Method')
df_pred

Unnamed: 0,method,disprot_id,pos,class,Group,disorder,binding,linker
0,flDPtr,DP02342,1,1.0,flDPtr,1,0,0
1,flDPtr,DP02342,2,1.0,flDPtr,1,0,0
2,flDPtr,DP02342,3,1.0,flDPtr,1,0,0
3,flDPtr,DP02342,4,1.0,flDPtr,1,0,0
4,flDPtr,DP02342,5,1.0,flDPtr,1,0,0
...,...,...,...,...,...,...,...,...
20869988,DFLpred,DP03746,1280,1.0,DFLpred,0,1,1
20869989,DFLpred,DP03746,1281,1.0,DFLpred,0,1,1
20869990,DFLpred,DP03746,1282,1.0,DFLpred,0,1,1
20869991,DFLpred,DP03746,1283,1.0,DFLpred,0,1,1


In [103]:
df_pred_binding = pd.pivot_table(df_pred.loc[df_pred['binding'] == 1], index=['disprot_id', 'pos'], columns=['method'], values=['class'])
df_pred_disorder = pd.pivot_table(df_pred.loc[df_pred['disorder'] == 1], index=['disprot_id', 'pos'], columns=['method'], values=['class'])

In [104]:
df_pred_binding.columns = df_pred_binding.columns.set_levels(['binding'], level=0)
df_pred_disorder.columns = df_pred_disorder.columns.set_levels(['disorder'], level=0)
df_pred = pd.merge(df_pred_disorder, df_pred_binding, on=['disprot_id', 'pos'])
df_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,disorder,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,method,AIUPred,APOD,AUCpred-no-profile,AUCpred-profile,AlphaFold-disorder,AlphaFold-rsa,DISOPRED3-diso,DeepIDP-2L,DisEMBL-dis465,DisEMBL-disHL,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
disprot_id,pos,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
DP02342,1,1.0,0.0,1.0,1.0,0.270,0.897,1.0,1.0,1.0,1.0,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,2,1.0,0.0,1.0,1.0,0.266,0.891,0.0,1.0,1.0,1.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,3,1.0,0.0,1.0,1.0,0.256,0.885,0.0,1.0,1.0,1.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,4,1.0,0.0,1.0,1.0,0.296,0.878,0.0,1.0,1.0,1.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
DP02342,5,1.0,0.0,1.0,1.0,0.292,0.873,0.0,1.0,,1.0,...,,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DP03896,1443,,,,,0.618,0.907,,,,,...,,,,,,,,,,
DP03896,1444,,,,,0.648,0.911,,,,,...,,,,,,,,,,
DP03896,1445,,,,,0.676,0.910,,,,,...,,,,,,,,,,
DP03896,1446,,,,,0.636,0.918,,,,,...,,,,,,,,,,


In [105]:
df_dataset = pd.read_csv(dataset_file, sep="\t", index_col=False)
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,binding nucleic acid,binding,binding protein,disorder_nox,pdb
0,DP02342,P06837,1,M,1.0,,,,,,1.0,
1,DP02342,P06837,2,L,1.0,,,,,,1.0,
2,DP02342,P06837,3,C,1.0,,,,,,1.0,
3,DP02342,P06837,4,C,1.0,,,,,,1.0,
4,DP02342,P06837,5,M,1.0,,,,,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
287015,DP02544,Q04410,368,Q,1.0,,,,,,1.0,
287016,DP02544,Q04410,369,S,1.0,,,,,,1.0,
287017,DP02544,Q04410,370,S,1.0,,,,,,1.0,
287018,DP02544,Q04410,371,S,1.0,,,,,,1.0,


In [106]:
df_dataset.columns = pd.MultiIndex.from_product([['reference'], df_dataset.columns])
df_dataset

Unnamed: 0_level_0,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference
Unnamed: 0_level_1,disprot_id,acc,pos,aa,disorder,linker,transition,binding nucleic acid,binding,binding protein,disorder_nox,pdb
0,DP02342,P06837,1,M,1.0,,,,,,1.0,
1,DP02342,P06837,2,L,1.0,,,,,,1.0,
2,DP02342,P06837,3,C,1.0,,,,,,1.0,
3,DP02342,P06837,4,C,1.0,,,,,,1.0,
4,DP02342,P06837,5,M,1.0,,,,,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
287015,DP02544,Q04410,368,Q,1.0,,,,,,1.0,
287016,DP02544,Q04410,369,S,1.0,,,,,,1.0,
287017,DP02544,Q04410,370,S,1.0,,,,,,1.0,
287018,DP02544,Q04410,371,S,1.0,,,,,,1.0,


In [107]:
df_dataset = pd.merge(df_dataset, df_pred, left_on=[('reference', 'disprot_id'), ('reference', 'pos')], right_on=['disprot_id', 'pos'], how='left')
df_dataset

Unnamed: 0_level_0,reference,reference,reference,reference,reference,reference,reference,reference,reference,reference,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,disprot_id,acc,pos,aa,disorder,linker,transition,binding nucleic acid,binding,binding protein,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
0,DP02342,P06837,1,M,1.0,,,,,,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,DP02342,P06837,2,L,1.0,,,,,,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,DP02342,P06837,3,C,1.0,,,,,,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,DP02342,P06837,4,C,1.0,,,,,,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,DP02342,P06837,5,M,1.0,,,,,,...,,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287015,DP02544,Q04410,368,Q,1.0,,,,,,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
287016,DP02544,Q04410,369,S,1.0,,,,,,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
287017,DP02544,Q04410,370,S,1.0,,,,,,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
287018,DP02544,Q04410,371,S,1.0,,,,,,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [108]:
df_dataset.set_index([('reference', 'disprot_id'), ('reference', 'acc'), ('reference', 'pos'), ('reference', 'aa')], append=True, inplace=True)
df_dataset.index.names = [None, 'disprot_id', 'acc', 'pos', 'aa']
df_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,reference,reference,reference,reference,reference,reference,reference,reference,disorder,disorder,...,binding,binding,binding,binding,binding,binding,binding,binding,binding,binding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,disorder,linker,transition,binding nucleic acid,binding,binding protein,disorder_nox,pdb,AIUPred,APOD,...,MoRFchibi-mc,MoRFchibi-mcl,MoRFchibi-mcw,OPAL,ProBiPred-nucleic,ProBiPred-protein,bindEmbed21IDR-idrGeneral,bindEmbed21IDR-idrNuc,bindEmbed21IDR-rawGeneral,bindEmbed21IDR-rawNuc
Unnamed: 0_level_2,disprot_id,acc,pos,aa,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
0,DP02342,P06837,1,M,1.0,,,,,,1.0,,1.0,0.0,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,DP02342,P06837,2,L,1.0,,,,,,1.0,,1.0,0.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,DP02342,P06837,3,C,1.0,,,,,,1.0,,1.0,0.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,DP02342,P06837,4,C,1.0,,,,,,1.0,,1.0,0.0,...,,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,DP02342,P06837,5,M,1.0,,,,,,1.0,,1.0,0.0,...,,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287015,DP02544,Q04410,368,Q,1.0,,,,,,1.0,,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
287016,DP02544,Q04410,369,S,1.0,,,,,,1.0,,1.0,1.0,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
287017,DP02544,Q04410,370,S,1.0,,,,,,1.0,,1.0,0.0,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
287018,DP02544,Q04410,371,S,1.0,,,,,,1.0,,1.0,0.0,...,,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [109]:
df_dataset.to_csv(dataset_merge_file, sep="\t")