## Merge reference data with prediction agreement

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
dataset_file = "../data/output/dataset.tsv"
prediction_dir = "../data/predictions"  # scp -r urano:/projects/CAID2/predictions .
predictions_references_file = "../data/predictions_references.tsv"

assessment_disorder_file = "../data/disorder.analysis.all.dataset.f1s.metrics.csv"
assessment_binding_file = "../data/binding.analysis.all.dataset.f1s.metrics.csv"

# Output
dataset_merge_file = "../data/output/dataset_merge.tsv"

In [3]:
# It assumes one method participate in only one challenge
df_challenge = pd.read_csv(predictions_references_file, sep="\t")
df_challenge
# df_challenge[df_challenge['Method'] == 'ENSHROUD-all']

Unnamed: 0,Method,disorder,linker,transition,binding,gene3d
0,AIUPred,0,0,1,1,1
1,AlphaFold-dis,1,1,1,0,1
2,AlphaFold-dis25,1,1,1,0,1
3,ANCHOR2,0,0,1,1,1
4,APOD,1,1,0,0,1
...,...,...,...,...,...,...
64,SETH-1,1,1,1,0,1
65,SPOT-Disorder,1,1,1,0,1
66,SPOT-Disorder-Single,1,1,1,0,1
67,SPOT-Disorder2,1,1,1,0,1


In [4]:
df_1 = pd.read_csv(assessment_disorder_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_1['challenge'] = 'disorder'

df_2 = pd.read_csv(assessment_binding_file, index_col=0).reset_index().rename(columns={'index': 'method'})[['method', 'thr']]
df_2['challenge'] = 'binding'
df_2

df_ = pd.concat([df_1, df_2])

df_thresholds = df_.loc[df_['method'].isin(df_challenge[df_challenge['disorder'] == 1]['Method']) & (df_['challenge'] == 'disorder')]
df_thresholds = pd.concat([df_thresholds, df_.loc[df_['method'].isin(df_challenge[df_challenge['binding'] == 1]['Method']) & (df_['challenge'] == 'binding')]])
df_thresholds['thr'].astype('float32')
df_thresholds
df_thresholds[df_thresholds['method'] == 'ENSHROUD-nucleic']

Unnamed: 0,method,thr,challenge


In [5]:
df_list = []
for pred_file in os.listdir(prediction_dir):
    pred_name = pred_file[:-5]  
    if pred_name in df_thresholds['method'].values:
#         print(pred_name)
        data = []
        with open("{}/{}".format(prediction_dir, pred_file)) as f:
            for line in f:
                if line[0] == ">":
                    disprot_id = line[1:].strip()
                else:
                    data.append([pred_name, disprot_id] + line[:-1].split("\t"))     
        df_ = pd.DataFrame(data, columns=['method', 'disprot_id', 'pos', 'aa', 'score', 'class'])
#         print(df_)
        if df_['score'].values[0] == "":
            df_['class'] = df_['class'].astype(int)
        else:
            if (df_['score'] == "").any():
                print("Empty score", pred_name)
                df_.loc[df_['score'] == "", 'score'] = np.nan
            df_['score'] = df_['score'].astype('float32')
            df_['class'] = 0
            df_.loc[df_['score'] >= df_thresholds.loc[df_thresholds['method'] == pred_name, 'thr'].iloc[0], 'class'] = 1
        df_.drop(columns=['aa', 'score'], inplace=True)
        df_list.append(df_)
    else:
        print("missing method", pred_name)

df_pred = pd.concat(df_list)    
df_pred

missing method ENSHROUD-nucleic
Empty score DisoBindPred
missing method DeepDISOBind-nucleic
missing method DeepDISOBind-protein
missing method ENSHROUD-protein


Unnamed: 0,method,disprot_id,pos,class
0,flDPtr,DP02342,1,1
1,flDPtr,DP02342,2,1
2,flDPtr,DP02342,3,1
3,flDPtr,DP02342,4,1
4,flDPtr,DP02342,5,1
...,...,...,...,...
301237,DFLpred,DP03746,1280,1
301238,DFLpred,DP03746,1281,1
301239,DFLpred,DP03746,1282,1
301240,DFLpred,DP03746,1283,1


In [6]:
# Fix bug in SETH-O position column
df_pred['pos'] = df_pred['pos'].astype(int)
df_pred.loc[df_pred['method'] == 'SETH-0', ['pos']] += 1

In [7]:
df_dataset = pd.read_csv(dataset_file, sep="\t")
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,af-binding,af-disorder,af-rsa,pdb
0,DP02342,P06837,1,M,1.0,,,,,,0.887,0.270,0.897,
1,DP02342,P06837,2,L,1.0,,,,,,0.889,0.266,0.891,
2,DP02342,P06837,3,C,1.0,,,,,,0.893,0.256,0.885,
3,DP02342,P06837,4,C,1.0,,,,,,0.876,0.296,0.878,
4,DP02342,P06837,5,M,1.0,,,,,,0.878,0.292,0.873,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,Q96ST2,815,N,,,,,,,,,,
297692,DP03758,Q96ST2,816,K,,,,,,,,,,
297693,DP03758,Q96ST2,817,M,,,,,,,,,,
297694,DP03758,Q96ST2,818,P,,,,,,,,,,


In [8]:
for challenge in ["disorder", "binding"]:
    methods = list(df_challenge.loc[df_challenge[challenge] == 1]['Method'])
    
    df_mean = df_pred.loc[df_pred['method'].isin(methods),:].groupby(['disprot_id', 'pos'])['class'].mean()
    df_mean = df_mean.to_frame().rename(columns={'class': challenge + "-mean"}).reset_index()
    
    df_dataset = pd.merge(df_dataset, df_mean, left_on=['disprot_id', 'pos'], right_on=['disprot_id', 'pos'], how='left')
df_dataset

Unnamed: 0,disprot_id,acc,pos,aa,disorder,linker,transition,nucleic acid binding,binding,protein binding,af-binding,af-disorder,af-rsa,pdb,disorder-mean,binding-mean
0,DP02342,P06837,1,M,1.0,,,,,,0.887,0.270,0.897,,0.815789,0.888889
1,DP02342,P06837,2,L,1.0,,,,,,0.889,0.266,0.891,,0.763158,0.814815
2,DP02342,P06837,3,C,1.0,,,,,,0.893,0.256,0.885,,0.736842,0.814815
3,DP02342,P06837,4,C,1.0,,,,,,0.876,0.296,0.878,,0.789474,0.851852
4,DP02342,P06837,5,M,1.0,,,,,,0.878,0.292,0.873,,0.815789,0.888889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297691,DP03758,Q96ST2,815,N,,,,,,,,,,,,
297692,DP03758,Q96ST2,816,K,,,,,,,,,,,,
297693,DP03758,Q96ST2,817,M,,,,,,,,,,,,
297694,DP03758,Q96ST2,818,P,,,,,,,,,,,,


In [10]:
df_dataset.to_csv(dataset_merge_file, sep="\t", index=False)