# ENVs

**Comments**  
* Variables for directories always starts with '`dir_`' and ends without '`/`'
* Variables for dataframes always starts with '`df_`'

In [1]:
dir_pjs            = "/home/martingb/Projects"
# caid-reference
dir_main           = f"{dir_pjs}/2022/caid2-reference"
dir_data           = f"{dir_main}/data"
dir_data_sifts     = f"{dir_data}/sifts"
dir_data_alphafold = f"{dir_data}/alphafold"
dir_data_disprot   = f"{dir_data}/disprot"
dir_results        = f"{dir_main}/results"
dir_src            = f"{dir_main}/src"
dir_src_modules    = f"{dir_src}/modules"
dir_tmp            = f"{dir_main}/tmp"

# Imports

In [2]:
import os
import json
import requests
import pandas as pd
import numpy as np
from pprint import pprint
# For GO and IDPO terms
import networkx
import obonet
import math

# Functions

In [3]:
def expand_region(adf: pd.DataFrame, database: str = 'disprot') -> pd.DataFrame:
    """
    """
    if database == 'disprot':
        adf["reg_position"] = list(range(int(adf["start"]), int(adf["end"]) + 1, 1))
    elif database == 'sifts':
        adf["sp_position"] = list(range(int(adf["SP_BEG"]), int(adf["SP_END"]) + 1, 1))
    else:
        raise ValueError(f"'{database}' is not a choise for parameter 'database'")
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    """
    adf["seq_position_aa"] = [(i+1, aa) for i, aa in enumerate(adf["sequence"])]
    return adf

# Retrieve release file

The following files should be placed in <root-to-CAID2>/data/annotations`

# Notes for CAID2

* Retrieve the latest DisProt release using the API as a json file
* Build a unify DataFrame using the `<disprot-release>.json": file


## Comments
- DisProt API has problems parsing:
    + Different versions
    + Regions for
        + Namespaces
    + Consensus

## DisProt cases to test

In [4]:
disprot_testing_cases = { "DP00005": "Antitermination protein N"
                        , "DP00009": "Transcription initiation factor IIA subunit 2"
                        , "DP00016": "Cyclin-dependent kinase inhibitor 1"
                        , "DP00040": "High mobility group protein HMG-I/HMG-Y"
                        , "DP00086": "Cellular tumor antigen p53" }

## GO and IDPO terms to be used

### IDPO

In [5]:
# The OBO must have "ontology: IDPO" header (first line)
graph = obonet.read_obo(f"{dir_data_disprot}/IDPO_v0.3.0.obo")
# graph.nodes(data=True)
#df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'Disorder_function'], columns=['term', 'name'])
df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'disorder_function'], columns=['term', 'name'])
df_ont

Unnamed: 0,term,name
0,IDPO:00000,disorder function
1,IDPO:00501,entropic chain
2,IDPO:00502,flexible linker/spacer
3,IDPO:00503,flexible N-terminal tail
4,IDPO:00504,flexible C-terminal tail
5,IDPO:00505,self-regulatory activity
6,IDPO:00506,self-inhibition
7,IDPO:00507,self-activation
8,IDPO:00508,self-assembly
9,IDPO:00024,molecular recognition display site


#### Ancestors

In [6]:
# Create the ancestors table
df_depth = []
df_ont_ancestors = []
for node in graph.nodes(data=True):
    if node[1]['namespace'] == 'disorder_function':
#         print(node[0], networkx.descendants(graph, node[0]), node[1].get('is_a'))
        for d in networkx.descendants(graph, node[0]):
            df_ont_ancestors.append([node[0], d])
            df_depth.append([node[0], len(list(networkx.all_shortest_paths(graph, source=node[0], target='IDPO:00000'))[0])])
        
df_ont_ancestors = pd.DataFrame(df_ont_ancestors, columns=['term', 'ancestor']).sort_values(by='term')
df_depth = pd.DataFrame(df_depth, columns=['term', 'depth']).sort_values(by='term')
df_ont_ancestors

Unnamed: 0,term,ancestor
14,IDPO:00024,IDPO:00000
16,IDPO:00025,IDPO:00000
15,IDPO:00025,IDPO:00024
17,IDPO:00026,IDPO:00024
18,IDPO:00026,IDPO:00000
19,IDPO:00027,IDPO:00024
20,IDPO:00027,IDPO:00000
22,IDPO:00028,IDPO:00000
21,IDPO:00028,IDPO:00024
23,IDPO:00029,IDPO:00024


### GO

#### Remove edges different to `is_a`

In [7]:
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(f"{dir_data_disprot}/go-basic.obo")
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))

for ele in to_remove:
    graph.remove_edge(*ele)


In [8]:

# for node in graph.nodes(data=True):
#     pprint(node)
#     break
# #df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'Disorder_function'], columns=['term', 'name'])
# df_ont = pd.DataFrame([[node[0], node[1]['name'], node[1]['namespace']]  for node in graph.nodes(data=True) \
#                         if (node[1]['namespace'] == 'biological_process') or \
#                            (node[1]['namespace'] == 'molecular_function') or \
#                            (node[1]['namespace'] == 'cellular_component') ], columns=['term', 'name', 'namespace'])
# df_ont

#### Ancestors

In [9]:
# # Create the ancestors table
# df_depth = []
# df_ont_ancestors = []
# for node in graph.nodes(data=True): #GO:0005488 binding
#     #if node[1]['namespace'] == 'binding':
#     if node[1]['namespace'] == 'molecular_function':
# #         print(node[0], networkx.descendants(graph, node[0]), node[1].get('is_a'))
#         for d in networkx.descendants(graph, node[0]):
#             df_ont_ancestors.append([node[0], d])
#             df_depth.append([node[0], len(list(networkx.all_shortest_paths(graph, source=node[0], target='GO:0003674'))[0])])
        
# df_ont_ancestors = pd.DataFrame(df_ont_ancestors, columns=['term', 'ancestor']).sort_values(by='term')
# df_depth = pd.DataFrame(df_depth, columns=['term', 'depth']).sort_values(by='term')
# df_ont_ancestors[df_ont_ancestors["term"] == "GO:0005488"]

In [10]:
# df_ont_ancestors[df_ont_ancestors["term"] == "GO:0002151"]

#### Children

In [11]:
# Parents
challenge_ancestors = [ ('GO:0005488', 'binding')
                      , ('GO:0003676', 'nucleic acid binding')
                      , ('GO:0005515', 'protein binding') ]
# Create children table
challenges_go = []
for go_term, challenge in challenge_ancestors:
    for node in graph.nodes(data=True):
        if node[0] == go_term:
            challenges_go.append([go_term, challenge])
            for d in networkx.ancestors(graph, node[0]): 
                challenges_go.append([d, challenge])

#### Ontology terms for challanges

In [12]:
# Main GO terms
# { "GO:0008150": "biological_process"
# , "GO:0003674": "molecular_function"
# , "GO:0005575": "cellular_component" }

challenges_idpo = [ ('IDPO:00076', 'disorder')
                  , ('IDPO:00077', 'disorder')
                  , ('IDPO:00078', 'disorder')
                  , ('IDPO:00501', 'linker')
                  , ('IDPO:00502', 'linker')
                  , ('IDPO:00503', 'linker')
                  , ('IDPO:00504', 'linker')
                  , ('IDPO:00049', 'transition')
                  , ('IDPO:00050', 'transition')
                  , ('IDPO:00051', 'transition')
                  , ('IDPO:00052', 'transition')
                  , ('IDPO:00053', 'transition')
                  , ('IDPO:00060', 'transition')
                  , ('IDPO:00055', 'transition')
                  , ('IDPO:00056', 'transition')
                  , ('IDPO:00061', 'transition')
                  , ('IDPO:00054', 'transition')
                  , ('IDPO:00057', 'transition')
                  , ('IDPO:00058', 'transition')
                  , ('IDPO:00059', 'transition')]

# challenges_go = list(zip( df_ont_children['child']
#                         , df_ont_children['challenge']))

challenges = challenges_go + challenges_idpo
challenges

[['GO:0005488', 'binding'],
 ['GO:0020037', 'binding'],
 ['GO:0016597', 'binding'],
 ['GO:0005114', 'binding'],
 ['GO:0098851', 'binding'],
 ['GO:1904047', 'binding'],
 ['GO:0030987', 'binding'],
 ['GO:0016168', 'binding'],
 ['GO:0033418', 'binding'],
 ['GO:0043515', 'binding'],
 ['GO:0062067', 'binding'],
 ['GO:0031828', 'binding'],
 ['GO:0045238', 'binding'],
 ['GO:0097247', 'binding'],
 ['GO:0140703', 'binding'],
 ['GO:0098750', 'binding'],
 ['GO:0010997', 'binding'],
 ['GO:0000976', 'binding'],
 ['GO:0003727', 'binding'],
 ['GO:0061676', 'binding'],
 ['GO:0005527', 'binding'],
 ['GO:0070644', 'binding'],
 ['GO:0110036', 'binding'],
 ['GO:0023027', 'binding'],
 ['GO:0001018', 'binding'],
 ['GO:0031434', 'binding'],
 ['GO:0008084', 'binding'],
 ['GO:1990782', 'binding'],
 ['GO:0050681', 'binding'],
 ['GO:0033458', 'binding'],
 ['GO:0046332', 'binding'],
 ['GO:0044378', 'binding'],
 ['GO:0000340', 'binding'],
 ['GO:0070181', 'binding'],
 ['GO:0120170', 'binding'],
 ['GO:0045027', 'bin

In [13]:
# Be aware that the column name for `term_id` is written correctly.
df_challenges = pd.DataFrame(data=challenges, columns=['term_id', 'challenge'])
df_challenges

Unnamed: 0,term_id,challenge
0,GO:0005488,binding
1,GO:0020037,binding
2,GO:0016597,binding
3,GO:0005114,binding
4,GO:0098851,binding
...,...,...
3082,IDPO:00061,transition
3083,IDPO:00054,transition
3084,IDPO:00057,transition
3085,IDPO:00058,transition


## CAID2 entries

In [14]:
main_columns = [ 'disprot_id'
               , 'acc'
               , 'name'
               , 'ncbi_taxon_id'           # int
               , 'organism'
               , 'sequence'
               , 'taxonomy' ]              # list of str (organisms)

region_columns = [ "disprot_namespace"  # str
                 , "region_id"          # str
                 , "date"               # str
                 , "start"              # int
                 , "end"                # int
                 , "term_id"            # str (GO and IDPO terms)
                 , "term_name"          # str (GO and IDPO description)
                 , "term_namespace"     # str (GO and IDPO namespace)
                 , "term_ontology" ]    # str (type of ontology, e.i.: GO, IDPO)

# """
# Ontology
# DisProt relies on three different ontologies to annotate intrinsically disordered regions, the Intrinsically Disordered Proteins Ontology (IDPO), the Gene Ontology (GO) and the Evidence and Conclusion Ontology (ECO).
#     * IDPO is used to describe structural aspects of an IDP/IDR, self-functions and functions directly associated with their disordered state.
#     * GO is used to describe functional aspects of an IDP/IDR.
#     * ECO describes the technique or evidence associated with an annotation.
# """

### Getting private DisProt entries

#### Old DisProt json

In [15]:
json_old = []
with open(f"{dir_data_disprot}/entries_2022_06.json", "r") as f:
    for line in f:
        json_old.append(json.loads(line))

#### New DisProt json

In [16]:
json_new = []
with open(f"{dir_data_disprot}/entries_2022_06_c.json", "r") as f:
    for line in f:
        json_new.append(json.loads(line))

#### Private DisProt json

In [35]:
json_private = []
entries_old = {k['disprot_id'] for k in json_old}
for ajson in json_new:
    if (not ajson['disprot_id'] in entries_old) and \
       (len(ajson['regions']) > 0)              and \
       (not 'obsolete' in ajson.keys())         and \
       (not 'X' in ajson['sequence']): # without non-standard aa
        json_private.append(ajson)
        entries_old.discard(ajson['disprot_id'])
len(json_private)

362

## Dataframes

### Dataframe for private DisProt entries

In [18]:
df_disprot_private = pd.json_normalize( data          = json_private
                                      , record_path   = ['regions']
                                      , meta          = main_columns
                                      , meta_prefix   = ''
                                      , record_prefix = '' )
df_disprot_private = df_disprot_private.loc[:, main_columns + region_columns]
df_disprot_private

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
1,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r006,2022-02-14T09:00:00.000Z,34,57,GO:0051179,localization,Biological process,GO
2,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r007,2022-02-14T09:00:00.000Z,34,57,GO:0098772,molecular function regulator,Molecular function,GO
3,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r009,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
4,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r010,2022-02-14T09:00:00.000Z,34,57,GO:0005515,protein binding,Molecular function,GO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,DP03744,Q9VVJ7,CG7484 protein,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,"Eukaryota, Metazoa, Ecdysozoa, Arthropoda, Hex...",Disorder function,DP03744r002,2022-06-17T10:48:39.874Z,53,178,GO:0045454,cell redox homeostasis,Biological process,GO
1217,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r001,2022-06-17T10:51:32.225Z,25,34,IDPO:00076,disorder,Structural state,IDPO
1218,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r002,2022-06-17T10:51:44.007Z,121,145,IDPO:00076,disorder,Structural state,IDPO
1219,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Disorder function,DP03745r003,2022-06-17T10:52:51.196Z,24,145,GO:0045454,cell redox homeostasis,Biological process,GO


##### Getting private DisProt entries IDs

In [19]:
set_disprot_ids_private = set(df_disprot_private["disprot_id"].tolist())
len(set_disprot_ids_private)

365

##### Manually testing if a disprot entry from july 2022 exists

In [20]:
'DP03753' in set_disprot_ids_private or 'DP03753' in df_disprot_private['disprot_id'].tolist()

False

### Dataframe for sequence

In [21]:
df_sequence = df_disprot_private[['disprot_id', 'acc', 'sequence']].copy(deep=True).drop_duplicates()
df_sequence['seq_position_aa'] = df_disprot_private['sequence'].apply(lambda x: [(i+1, aa) for i, aa in enumerate(x)])
df_sequence = df_sequence.explode("seq_position_aa")
df_sequence[['seq_position', 'seq_aa']] = pd.DataFrame(df_sequence['seq_position_aa'].tolist(), index=df_sequence.index)
df_sequence.drop(columns=['seq_position_aa', 'sequence'], inplace=True)
df_sequence


Unnamed: 0,disprot_id,acc,seq_position,seq_aa
0,DP02342,P06837,1,M
0,DP02342,P06837,2,L
0,DP02342,P06837,3,C
0,DP02342,P06837,4,C
0,DP02342,P06837,5,M
...,...,...,...,...
1220,DP03746,Q9QUH6-2,1280,Q
1220,DP03746,Q9QUH6-2,1281,L
1220,DP03746,Q9QUH6-2,1282,L
1220,DP03746,Q9QUH6-2,1283,I


### Dataframe for regions

In [22]:
df_regions = df_disprot_private.apply(expand_region, database='disprot', axis=1).copy(deep=True)
df_regions = df_regions.explode("reg_position")
df_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,1
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,2
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,3
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,4
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,382
1220,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,383
1220,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,384
1220,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,385


In [23]:

df_regions = pd.merge( left  = df_regions
                     , right = df_challenges
                     , how   = "left"
                     , on    = "term_id" )


df_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position,challenge
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,1,disorder
1,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,2,disorder
2,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,3,disorder
3,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,4,disorder
4,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,5,disorder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140295,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,382,disorder
140296,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,383,disorder
140297,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,384,disorder
140298,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,385,disorder


In [24]:
df_regions = df_regions[['disprot_id', 'acc', 'challenge', 'reg_position']]
df_regions.dropna(subset='challenge', inplace=True)
df_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)
df_regions

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regions.dropna(subset='challenge', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)


Unnamed: 0,disprot_id,acc,challenge,reg_position
0,DP02342,P06837,disorder,1
1,DP02342,P06837,disorder,2
2,DP02342,P06837,disorder,3
3,DP02342,P06837,disorder,4
4,DP02342,P06837,disorder,5
...,...,...,...,...
140295,DP03746,Q9QUH6-2,disorder,382
140296,DP03746,Q9QUH6-2,disorder,383
140297,DP03746,Q9QUH6-2,disorder,384
140298,DP03746,Q9QUH6-2,disorder,385


### SIFTS dataframe

In [25]:
df_sifts = pd.read_csv(f"{dir_data_sifts}/uniprot_segments_observed.tsv.gz", sep="\t", header=1)
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,113l,A,P00720,1,162,1,162,1,162
1,11gs,A,P09211,3,210,2,209,3,210
2,11gs,B,P09211,3,210,2,209,3,210
3,121p,A,P01112,1,166,1,166,1,166
4,133l,A,P61626,1,130,1,130,19,148
...,...,...,...,...,...,...,...,...,...
962405,7z7o,C,B1PNC0,69,227,71,229,61,219
962406,7z7o,D,B1PNC0,13,67,13,67,3,57
962407,7z7o,D,B1PNC0,68,68,69,69,59,60
962408,7z7o,D,B1PNC0,68,68,69,69,58,58


#### Subsetting dataframe to have only present disprot sequences

In [26]:
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(df_sequence['acc'].unique().tolist())]
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
1082,1h9e,A,P42166,1,56,1,56,2,57
1307,1ig8,A,P04807,18,486,18,486,18,486
1649,1k99,A,P17480,2,91,2,91,103,192
1960,1m7k,A,O95429,19,99,376,456,376,456
2382,1olt,A,P32131,4,13,4,13,4,13
...,...,...,...,...,...,...,...,...,...
961229,7vjy,A,P0DTC1,1,306,1,306,3264,3569
961312,7vk3,A,P0DTC1,1,300,1,300,3264,3563
961313,7vk3,B,P0DTC1,3,301,3,301,3266,3564
961328,7vk7,A,P0DTC1,1,300,1,300,3264,3563


In [27]:
df_sifts = df_sifts.apply(expand_region, database='sifts', axis=1)
df_sifts = df_sifts.explode("sp_position")
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
1082,1h9e,A,P42166,1,56,1,56,2,57,2
1082,1h9e,A,P42166,1,56,1,56,2,57,3
1082,1h9e,A,P42166,1,56,1,56,2,57,4
1082,1h9e,A,P42166,1,56,1,56,2,57,5
1082,1h9e,A,P42166,1,56,1,56,2,57,6
...,...,...,...,...,...,...,...,...,...,...
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3560
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3561
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3562
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3563


In [28]:
# Add wwPDB observed residues
# Suposition: aa is the same in uniprot and wwPDB
df_sequence_sifts = pd.merge( left     = df_sequence
                            , right    = df_sifts
                            , left_on  = ['acc', 'seq_position']
                            , right_on = ['SP_PRIMARY', 'sp_position']
                            , how      = 'left' )
df_sequence_sifts

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
0,DP02342,P06837,1,M,,,,,,,,,,
1,DP02342,P06837,2,L,,,,,,,,,,
2,DP02342,P06837,3,C,,,,,,,,,,
3,DP02342,P06837,4,C,,,,,,,,,,
4,DP02342,P06837,5,M,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392837,DP03746,Q9QUH6-2,1280,Q,,,,,,,,,,
1392838,DP03746,Q9QUH6-2,1281,L,,,,,,,,,,
1392839,DP03746,Q9QUH6-2,1282,L,,,,,,,,,,
1392840,DP03746,Q9QUH6-2,1283,I,,,,,,,,,,


In [29]:
df_sequence_sifts = df_sequence_sifts[['disprot_id', 'acc', 'seq_position', 'seq_aa', 'sp_position']]
df_sequence_sifts.drop_duplicates(inplace=True)
df_sequence_sifts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sequence_sifts.drop_duplicates(inplace=True)


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position
0,DP02342,P06837,1,M,
1,DP02342,P06837,2,L,
2,DP02342,P06837,3,C,
3,DP02342,P06837,4,C,
4,DP02342,P06837,5,M,
...,...,...,...,...,...
1392837,DP03746,Q9QUH6-2,1280,Q,
1392838,DP03746,Q9QUH6-2,1281,L,
1392839,DP03746,Q9QUH6-2,1282,L,
1392840,DP03746,Q9QUH6-2,1283,I,


### Alphafold dataframe

In [30]:
df_af = pd.DataFrame()
header = [ 'name'
         , 'pos'
         , 'aa'
         , 'lddt'
         , 'disorder'
         , 'rsa'
         , 'ss'
         , 'disorder-25'
         , 'binding-25-0.581' ]
for af_file in os.listdir(f"{dir_data_alphafold}"):
    af_pred = pd.read_csv(f"{dir_data_alphafold}/{af_file}", sep='\t')
    df_af   = pd.concat([df_af, af_pred], ignore_index=True)
df_af['name'] = df_af['name'].apply(lambda x: x.replace('AF-','').replace('-F1-model_v3',''))
df_af

Unnamed: 0,name,pos,aa,lddt,disorder,rsa,ss,disorder-25,binding-25-0.581
0,P91870,1,M,0.485,0.515,1.000,-,0.866,0.784
1,P91870,2,E,0.575,0.425,0.928,-,0.872,0.822
2,P91870,3,D,0.608,0.392,0.890,-,0.876,0.836
3,P91870,4,D,0.578,0.422,0.988,-,0.879,0.823
4,P91870,5,A,0.626,0.374,0.811,-,0.888,0.843
...,...,...,...,...,...,...,...,...,...
218152,A1KVD0,151,D,0.971,0.029,0.454,H,0.415,0.415
218153,A1KVD0,152,N,0.958,0.042,0.745,H,0.422,0.422
218154,A1KVD0,153,A,0.930,0.070,0.236,T,0.447,0.447
218155,A1KVD0,154,R,0.893,0.107,0.476,T,0.462,0.462


In [31]:
# Add Alphafold predictions
df_sequence_af = pd.merge( left     = df_sequence
                         , right    = df_af
                         , left_on  = ['acc', 'seq_position']
                         , right_on = ['name', 'pos']
                         , how      = 'left' )
print(len(df_sequence_af.disprot_id.unique().tolist()))
df_sequence_af

365


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,name,pos,aa,lddt,disorder,rsa,ss,disorder-25,binding-25-0.581
0,DP02342,P06837,1,M,P06837,1.0,M,0.730,0.270,1.000,-,0.897,0.887
1,DP02342,P06837,2,L,P06837,2.0,L,0.734,0.266,0.994,G,0.891,0.889
2,DP02342,P06837,3,C,P06837,3.0,C,0.744,0.256,0.919,G,0.885,0.893
3,DP02342,P06837,4,C,P06837,4.0,C,0.704,0.296,0.874,G,0.878,0.876
4,DP02342,P06837,5,M,P06837,5.0,M,0.708,0.292,0.888,G,0.873,0.878
...,...,...,...,...,...,...,...,...,...,...,...,...,...
306255,DP03746,Q9QUH6-2,1280,Q,,,,,,,,,
306256,DP03746,Q9QUH6-2,1281,L,,,,,,,,,
306257,DP03746,Q9QUH6-2,1282,L,,,,,,,,,
306258,DP03746,Q9QUH6-2,1283,I,,,,,,,,,


In [32]:
df_sequence_af = df_sequence_af[['disprot_id', 'acc', 'seq_position', 'seq_aa', 'lddt']]
df_sequence_af.drop_duplicates(inplace=True)
df_sequence_af

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sequence_af.drop_duplicates(inplace=True)


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,lddt
0,DP02342,P06837,1,M,0.730
1,DP02342,P06837,2,L,0.734
2,DP02342,P06837,3,C,0.744
3,DP02342,P06837,4,C,0.704
4,DP02342,P06837,5,M,0.708
...,...,...,...,...,...
306255,DP03746,Q9QUH6-2,1280,Q,
306256,DP03746,Q9QUH6-2,1281,L,
306257,DP03746,Q9QUH6-2,1282,L,
306258,DP03746,Q9QUH6-2,1283,I,


## References

In [33]:
for challenge in ['disorder', 'linker', 'transition', 'binding', 'nucleic acid binding', 'protein binding']:
    # Challenge
    df_entry_region_challenge = pd.merge( left     = df_sequence
                                        , right    = df_regions.loc[df_regions['challenge'] == challenge]
                                        , left_on  = ["disprot_id", 'acc', 'seq_position']
                                        , right_on = ["disprot_id", 'acc', 'reg_position']
                                        , how      = 'left' )
    with open(f"{dir_results}/{challenge.replace(' ', '_')}.fasta", "w+") as f:
        for disprot_id, df_g in df_entry_region_challenge.groupby(by='disprot_id'):
            f.write(">{}\n{}\n{}\n".format( disprot_id
                                          , ''.join(df_g['seq_aa'])
                                          , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['challenge']]) ))
    # Challange + wwPDB (Sifts)
    df_sequence_sifts_challenge = pd.merge( left     = df_sequence_sifts
                                          , right    = df_entry_region_challenge
                                          , left_on  = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'sp_position']
                                          , right_on = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'reg_position']
                                          , how      = 'left' )
    df_sequence_sifts_challenge['result'] = '-'
    df_sequence_sifts_challenge.loc[df_sequence_sifts_challenge['sp_position'].notnull(), 'result'] = '0' 
    df_sequence_sifts_challenge.loc[df_sequence_sifts_challenge['challenge'].notnull(), 'result'] = '1' # Assigment: 1 for challenge
    with open(f"{dir_results}/pdb-{challenge.replace(' ', '_')}-atleast.fasta", "w+") as f:
        for disprot_id, df_g in df_sequence_sifts_challenge.groupby(by='disprot_id'):
            f.write(">{}\n{}\n{}\n".format( disprot_id
                                          , ''.join(df_g['seq_aa'])
                                          , ''.join(df_g['result'])))
    # Challange + Alphafold
    df_sequence_af_challenge = pd.merge( left     = df_sequence_af
                                       , right    = df_regions.loc[df_regions['challenge'] == challenge]
                                       , left_on  = ["disprot_id", 'acc', 'seq_position']
                                       , right_on = ["disprot_id", 'acc', 'reg_position']
                                       , how      = 'left' )
    df_sequence_af_challenge['result'] = '-'
    df_sequence_af_challenge.loc[df_sequence_af_challenge['lddt'] > 0.7, 'result'] = '0'  # Assigment: 0 for AF order
    df_sequence_af_challenge.loc[df_sequence_af_challenge['challenge'].notnull(), 'result'] = '1' # Assigment: 1 for challenge
    with open(f"{dir_results}/af-{challenge.replace(' ', '_')}-atleast.fasta", "w+") as f:
        for disprot_id, df_g in df_sequence_af_challenge.groupby(by='disprot_id'):
            f.write(">{}\n{}\n{}\n".format( disprot_id
                                        , ''.join(df_g['seq_aa'])
                                        , ''.join(df_g['result'])))

## Baselines

In [34]:
# # wwPDB (Sifts) - CAID format
with open(f"{dir_results}/pdb-atleast.fasta", "w+") as f:
    for disprot_id, df_g in df_sequence_sifts.groupby(by='disprot_id'):
        f.write(">{}\n{}\n{}\n".format( disprot_id
                                      , ''.join(df_g['seq_aa'])
                                      , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['sp_position']]))) # Assigment: 0 for disorder