# ENVs

**Comments**  
* Variables for directories always starts with '`dir_`' and ends without '`/`'
* Variables for dataframes always starts with '`df_`'

In [1]:
dir_pjs         = "/home/martingb/Projects"
# caid-reference
dir_main         = f"{dir_pjs}/2022/caid2-reference"
dir_data         = f"{dir_main}/data"
dir_data_sifts   = f"{dir_data}/sifts"
dir_data_disprot = f"{dir_data}/disprot"
dir_results      = f"{dir_main}/results"
dir_src          = f"{dir_main}/src"
dir_src_modules  = f"{dir_src}/modules"
dir_tmp          = f"{dir_main}/tmp"

## f-string variables

In [2]:
nl  = "\n"
tab = "\t"

# Imports

In [3]:
import os
import json
import requests
import pandas as pd
import numpy as np
from pprint import pprint
# For GO and IDPO terms
import networkx
import obonet
import math

# Functions

# Retrieve release file

The following files should be placed in <root-to-CAID2>/data/annotations`

### Retrieve `disprot-2022_06-all.json`

## Retrieve annotations for CAID2

# Notes for CAID2

* Retrieve the latest DisProt release using the API as a json file
* Build a unify DataFrame using the `<disprot-release>.json": file


## Comments
- DisProt API has problems parsing:
    + Different versions
    + Regions for
        + Namespaces
    + Consensus

## DisProt cases to test

In [4]:
disprot_testing_cases = { "DP00005": "Antitermination protein N"
                        , "DP00009": "Transcription initiation factor IIA subunit 2"
                        , "DP00016": "Cyclin-dependent kinase inhibitor 1"
                        , "DP00040": "High mobility group protein HMG-I/HMG-Y"
                        , "DP00086": "Cellular tumor antigen p53" }

## GO and IDPO terms to be used

### IDPO

In [5]:
# The OBO must have "ontology: IDPO" header (first line)
graph = obonet.read_obo(f"{dir_data_disprot}/IDPO_v0.3.0.obo")
# graph.nodes(data=True)
#df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'Disorder_function'], columns=['term', 'name'])
df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'disorder_function'], columns=['term', 'name'])
df_ont

Unnamed: 0,term,name
0,IDPO:00000,disorder function
1,IDPO:00501,entropic chain
2,IDPO:00502,flexible linker/spacer
3,IDPO:00503,flexible N-terminal tail
4,IDPO:00504,flexible C-terminal tail
5,IDPO:00505,self-regulatory activity
6,IDPO:00506,self-inhibition
7,IDPO:00507,self-activation
8,IDPO:00508,self-assembly
9,IDPO:00024,molecular recognition display site


#### Ancestors

In [6]:
# Create the ancestors table
df_depth = []
df_ont_ancestors = []
for node in graph.nodes(data=True):
    if node[1]['namespace'] == 'disorder_function':
#         print(node[0], networkx.descendants(graph, node[0]), node[1].get('is_a'))
        for d in networkx.descendants(graph, node[0]):
            df_ont_ancestors.append([node[0], d])
            df_depth.append([node[0], len(list(networkx.all_shortest_paths(graph, source=node[0], target='IDPO:00000'))[0])])
        
df_ont_ancestors = pd.DataFrame(df_ont_ancestors, columns=['term', 'ancestor']).sort_values(by='term')
df_depth = pd.DataFrame(df_depth, columns=['term', 'depth']).sort_values(by='term')
df_ont_ancestors

Unnamed: 0,term,ancestor
14,IDPO:00024,IDPO:00000
16,IDPO:00025,IDPO:00024
15,IDPO:00025,IDPO:00000
17,IDPO:00026,IDPO:00000
18,IDPO:00026,IDPO:00024
19,IDPO:00027,IDPO:00000
20,IDPO:00027,IDPO:00024
22,IDPO:00028,IDPO:00024
21,IDPO:00028,IDPO:00000
23,IDPO:00029,IDPO:00000


### GO

#### Remove edges different to `is_a`

In [7]:
# The OBO must have "ontology: GO" header (first line)
graph = obonet.read_obo(f"{dir_data_disprot}/go-basic.obo")
to_remove = []
for e in graph.edges:
    if e[2] != 'is_a':
        to_remove.append((e[0], e[1]))

for ele in to_remove:
    graph.remove_edge(*ele)


In [8]:

# for node in graph.nodes(data=True):
#     pprint(node)
#     break
# #df_ont = pd.DataFrame([[node[0], node[1]['name']] for node in graph.nodes(data=True) if node[1]['namespace'] == 'Disorder_function'], columns=['term', 'name'])
# df_ont = pd.DataFrame([[node[0], node[1]['name'], node[1]['namespace']]  for node in graph.nodes(data=True) \
#                         if (node[1]['namespace'] == 'biological_process') or \
#                            (node[1]['namespace'] == 'molecular_function') or \
#                            (node[1]['namespace'] == 'cellular_component') ], columns=['term', 'name', 'namespace'])
# df_ont

#### Ancestors

In [9]:
# # Create the ancestors table
# df_depth = []
# df_ont_ancestors = []
# for node in graph.nodes(data=True): #GO:0005488 binding
#     #if node[1]['namespace'] == 'binding':
#     if node[1]['namespace'] == 'molecular_function':
# #         print(node[0], networkx.descendants(graph, node[0]), node[1].get('is_a'))
#         for d in networkx.descendants(graph, node[0]):
#             df_ont_ancestors.append([node[0], d])
#             df_depth.append([node[0], len(list(networkx.all_shortest_paths(graph, source=node[0], target='GO:0003674'))[0])])
        
# df_ont_ancestors = pd.DataFrame(df_ont_ancestors, columns=['term', 'ancestor']).sort_values(by='term')
# df_depth = pd.DataFrame(df_depth, columns=['term', 'depth']).sort_values(by='term')
# df_ont_ancestors[df_ont_ancestors["term"] == "GO:0005488"]

In [10]:
# df_ont_ancestors[df_ont_ancestors["term"] == "GO:0002151"]

#### Children

In [11]:
# Parents
challenge_ancestors = [ ('GO:0005488', 'binding')
                      , ('GO:0003676', 'nucleic acid binding')
                      , ('GO:0005515', 'protein binding') ]
# Create children table
challenges_go = []
for go_term, challenge in challenge_ancestors:
    for node in graph.nodes(data=True):
        if node[0] == go_term:
            challenges_go.append([go_term, challenge])
            for d in networkx.ancestors(graph, node[0]): 
                challenges_go.append([d, challenge])

#### Ontology terms for challanges

In [12]:
# Main GO terms
# { "GO:0008150": "biological_process"
# , "GO:0003674": "molecular_function"
# , "GO:0005575": "cellular_component" }

challenges_idpo = [ ('IDPO:00076', 'disorder')
                  , ('IDPO:00077', 'disorder')
                  , ('IDPO:00078', 'disorder')
                  , ('IDPO:00501', 'linker')
                  , ('IDPO:00502', 'linker')
                  , ('IDPO:00503', 'linker')
                  , ('IDPO:00504', 'linker')
                  , ('IDPO:00049', 'transition')
                  , ('IDPO:00050', 'transition')
                  , ('IDPO:00051', 'transition')
                  , ('IDPO:00052', 'transition')
                  , ('IDPO:00053', 'transition')
                  , ('IDPO:00060', 'transition')
                  , ('IDPO:00055', 'transition')
                  , ('IDPO:00056', 'transition')
                  , ('IDPO:00061', 'transition')
                  , ('IDPO:00054', 'transition')
                  , ('IDPO:00057', 'transition')
                  , ('IDPO:00058', 'transition')
                  , ('IDPO:00059', 'transition')]

# challenges_go = list(zip( df_ont_children['child']
#                         , df_ont_children['challenge']))

challenges = challenges_go + challenges_idpo
challenges

[['GO:0005488', 'binding'],
 ['GO:0042835', 'binding'],
 ['GO:0034186', 'binding'],
 ['GO:0031857', 'binding'],
 ['GO:0019843', 'binding'],
 ['GO:0009374', 'binding'],
 ['GO:0050750', 'binding'],
 ['GO:0030519', 'binding'],
 ['GO:0001872', 'binding'],
 ['GO:0001515', 'binding'],
 ['GO:0017130', 'binding'],
 ['GO:0070052', 'binding'],
 ['GO:0001161', 'binding'],
 ['GO:0044589', 'binding'],
 ['GO:0030560', 'binding'],
 ['GO:0050815', 'binding'],
 ['GO:0097322', 'binding'],
 ['GO:0005515', 'binding'],
 ['GO:0051861', 'binding'],
 ['GO:1901265', 'binding'],
 ['GO:0010181', 'binding'],
 ['GO:0099567', 'binding'],
 ['GO:1905577', 'binding'],
 ['GO:0001664', 'binding'],
 ['GO:0033402', 'binding'],
 ['GO:0034185', 'binding'],
 ['GO:0019973', 'binding'],
 ['GO:0042393', 'binding'],
 ['GO:0005519', 'binding'],
 ['GO:0031369', 'binding'],
 ['GO:0031700', 'binding'],
 ['GO:0016015', 'binding'],
 ['GO:0034236', 'binding'],
 ['GO:1903877', 'binding'],
 ['GO:0097371', 'binding'],
 ['GO:0001791', 'bin

In [13]:
# Be aware that the column name for `term_id` is written correctly.
df_challenges = pd.DataFrame(data=challenges, columns=['term_id', 'challenge'])
df_challenges

Unnamed: 0,term_id,challenge
0,GO:0005488,binding
1,GO:0042835,binding
2,GO:0034186,binding
3,GO:0031857,binding
4,GO:0019843,binding
...,...,...
3082,IDPO:00061,transition
3083,IDPO:00054,transition
3084,IDPO:00057,transition
3085,IDPO:00058,transition


## CAID2 Dataframe from json

In [14]:
main_columns = [ 'disprot_id'
               , 'acc'
               , 'name'
               , 'ncbi_taxon_id'           # int
               , 'organism'
               , 'sequence'
               , 'taxonomy' ]              # list of str (organisms)

region_columns = [ "disprot_namespace"  # str
                 , "region_id"          # str
                 , "date"               # str
                 , "start"              # int
                 , "end"                # int
                 , "term_id"            # str (GO and IDPO terms)
                 , "term_name"          # str (GO and IDPO description)
                 , "term_namespace"     # str (GO and IDPO namespace)
                 , "term_ontology" ]    # str (type of ontology, e.i.: GO, IDPO)

# """
# Ontology
# DisProt relies on three different ontologies to annotate intrinsically disordered regions, the Intrinsically Disordered Proteins Ontology (IDPO), the Gene Ontology (GO) and the Evidence and Conclusion Ontology (ECO).
#     * IDPO is used to describe structural aspects of an IDP/IDR, self-functions and functions directly associated with their disordered state.
#     * GO is used to describe functional aspects of an IDP/IDR.
#     * ECO describes the technique or evidence associated with an annotation.
# """

### Parsing json to Dataframe

#### Old DisProt json

In [15]:
json_old = []
with open(f"{dir_data_disprot}/entries_2022_06.json", "r") as f:
    for line in f:
        json_old.append(json.loads(line))

In [16]:
multiple_level_data = pd.json_normalize( data          = json_old
                                       , record_path   = ['regions']
                                       , meta          = main_columns
                                       , meta_prefix   = ''
                                       , record_prefix = '' )
multiple_level_data = multiple_level_data.loc[:, main_columns + region_columns]
multiple_level_data

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",,DP00003r001,2018-01-31T14:00:00.000Z,174,179,00076,Disorder,Structural state,IDPO
1,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r002,2022-02-14T09:00:00.000Z,294,334,IDPO:00076,disorder,Structural state,IDPO
2,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Disorder function,DP00003r003,2021-02-16T10:44:57.192Z,294,334,00002,Flexible linker/spacer,Disorder function,IDPO
3,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r004,2022-02-14T09:00:00.000Z,454,464,IDPO:00076,disorder,Structural state,IDPO
4,DP00004,P49913,Cathelicidin antimicrobial peptide,9606,Homo sapiens,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00004r001,2022-02-14T09:00:00.000Z,134,170,IDPO:00076,disorder,Structural state,IDPO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13078,DP03728,Q6CSX2,Serine/threonine-protein kinase ATG1,284590,Kluyveromyces lactis (strain ATCC 8585 / CBS 2...,MSSESHDKVVAKAIRLPTENYSVEKEIGKGSFAVVYKGLSLRDGRN...,"[Eukaryota, Fungi, Dikarya, Ascomycota, Saccha...",Disorder function,DP03728r002,2022-06-07T21:12:35.877Z,562,831,GO:1990316,Atg1/ULK1 kinase complex,Cellular component,GO
13079,DP03729,Q8IYT8,Serine/threonine-protein kinase ULK2,9606,Homo sapiens,MEVVGDFEYSKRDLVGHGAFAVVFRGRHRQKTDWEVAIKSINKKNL...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP03729r001,2022-06-08T14:43:51.005Z,168,177,IDPO:00076,disorder,Structural state,IDPO
13080,DP03731,Q9UHK0,Nuclear fragile X mental retardation-interacti...,9606,Homo sapiens,MAEPTSDFETPIGWHASPELTPTLGPLSDTAPPRDSWMFWAMLPPP...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP03731r001,2022-06-13T12:37:58.500Z,486,495,IDPO:00076,disorder,Structural state,IDPO
13081,DP03731,Q9UHK0,Nuclear fragile X mental retardation-interacti...,9606,Homo sapiens,MAEPTSDFETPIGWHASPELTPTLGPLSDTAPPRDSWMFWAMLPPP...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP03731r002,2022-06-13T12:37:15.199Z,462,495,GO:0005515,protein binding,Molecular function,GO


In [17]:
set_disprot_ids_old = set(multiple_level_data["disprot_id"].tolist())
len(set_disprot_ids_old)

2620

#### New DisProt json

In [18]:
json_new = []
with open(f"{dir_data_disprot}/entries_2022_06_c.json", "r") as f:
    for line in f:
        json_new.append(json.loads(line))
        #print(dict_all["size"])

In [19]:
multiple_level_data = pd.json_normalize( data          = json_new
                                       , record_path   = ['regions']
                                       , meta          = main_columns
                                       , meta_prefix   = ''
                                       , record_prefix = '' )
multiple_level_data = multiple_level_data.loc[:, main_columns + region_columns]
multiple_level_data

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",,DP00003r001,2018-01-31T14:00:00.000Z,174,179,00076,Disorder,Structural state,IDPO
1,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r002,2022-02-14T09:00:00.000Z,294,334,IDPO:00076,disorder,Structural state,IDPO
2,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Disorder function,DP00003r003,2021-02-16T10:44:57.192Z,294,334,00002,Flexible linker/spacer,Disorder function,IDPO
3,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r004,2022-02-14T09:00:00.000Z,454,464,IDPO:00076,disorder,Structural state,IDPO
4,DP00004,P49913,Cathelicidin antimicrobial peptide,9606,Homo sapiens,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00004r001,2022-02-14T09:00:00.000Z,134,170,IDPO:00076,disorder,Structural state,IDPO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14310,DP03744,Q9VVJ7,CG7484 protein,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,"Eukaryota, Metazoa, Ecdysozoa, Arthropoda, Hex...",Disorder function,DP03744r002,2022-06-17T10:48:39.874Z,53,178,GO:0045454,cell redox homeostasis,Biological process,GO
14311,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r001,2022-06-17T10:51:32.225Z,25,34,IDPO:00076,disorder,Structural state,IDPO
14312,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r002,2022-06-17T10:51:44.007Z,121,145,IDPO:00076,disorder,Structural state,IDPO
14313,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Disorder function,DP03745r003,2022-06-17T10:52:51.196Z,24,145,GO:0045454,cell redox homeostasis,Biological process,GO


#### Getting private DisProt entries

In [20]:
set_disprot_ids_new = set(multiple_level_data["disprot_id"].tolist())
len(set_disprot_ids_new)

2988

In [21]:
set_disprot_ids_private = set_disprot_ids_new - set_disprot_ids_old
len(set_disprot_ids_private)

368

##### Manually testing if a disprot entry from july 2022 exists

In [22]:
'DP03753' in set_disprot_ids_private or 'DP03753' in multiple_level_data['disprot_id'].tolist()

False

#### All the keys from the record `regions` in disprot json release 2022_06

##### Regions

In [23]:
# # Filter testing examples
# multiple_level_data = multiple_level_data.loc[multiple_level_data['disprot_id'].isin(disprot_testing_cases.keys())]
# multiple_level_data


In [24]:
# Filter disprot entries to use privates
multiple_level_data = multiple_level_data.loc[multiple_level_data['disprot_id'].isin(list(set_disprot_ids_private))]
multiple_level_data


Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
10083,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r006,2022-02-14T09:00:00.000Z,34,57,GO:0051179,localization,Biological process,GO
10084,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r007,2022-02-14T09:00:00.000Z,34,57,GO:0098772,molecular function regulator,Molecular function,GO
10085,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r009,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO
10086,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP02342r010,2022-02-14T09:00:00.000Z,34,57,GO:0005515,protein binding,Molecular function,GO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14310,DP03744,Q9VVJ7,CG7484 protein,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,"Eukaryota, Metazoa, Ecdysozoa, Arthropoda, Hex...",Disorder function,DP03744r002,2022-06-17T10:48:39.874Z,53,178,GO:0045454,cell redox homeostasis,Biological process,GO
14311,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r001,2022-06-17T10:51:32.225Z,25,34,IDPO:00076,disorder,Structural state,IDPO
14312,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r002,2022-06-17T10:51:44.007Z,121,145,IDPO:00076,disorder,Structural state,IDPO
14313,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Disorder function,DP03745r003,2022-06-17T10:52:51.196Z,24,145,GO:0045454,cell redox homeostasis,Biological process,GO


In [25]:
def expand_region(adf: pd.DataFrame, database: str = 'disprot') -> pd.DataFrame:
    """
    """
    if database == 'disprot':
        adf["reg_position"] = list(range(int(adf["start"]), int(adf["end"]) + 1, 1))
    elif database == 'sifts':
        adf["sp_position"] = list(range(int(adf["SP_BEG"]), int(adf["SP_END"]) + 1, 1))
    else:
        raise ValueError(f"'{database}' is not a choise for parameter 'database'")
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    """
    adf["seq_position_aa"] = [(i+1, aa) for i, aa in enumerate(adf["sequence"])]
    return adf


In [26]:
dp_sequence = multiple_level_data[['disprot_id', 'acc', 'sequence']].copy(deep=True).drop_duplicates()
dp_sequence['seq_position_aa'] = multiple_level_data['sequence'].apply(lambda x: [(i+1, aa) for i, aa in enumerate(x)])
dp_sequence = dp_sequence.explode("seq_position_aa")
dp_sequence[['seq_position', 'seq_aa']] = pd.DataFrame(dp_sequence['seq_position_aa'].tolist(), index=dp_sequence.index)
dp_sequence.drop(columns=['seq_position_aa', 'sequence'], inplace=True)
dp_sequence


Unnamed: 0,disprot_id,acc,seq_position,seq_aa
10082,DP02342,P06837,1,M
10082,DP02342,P06837,2,L
10082,DP02342,P06837,3,C
10082,DP02342,P06837,4,C
10082,DP02342,P06837,5,M
...,...,...,...,...
14314,DP03746,Q9QUH6-2,1280,Q
14314,DP03746,Q9QUH6-2,1281,L
14314,DP03746,Q9QUH6-2,1282,L
14314,DP03746,Q9QUH6-2,1283,I


### Dataframe for regions

In [27]:
dp_entry_regions = multiple_level_data.apply(expand_region, database='disprot', axis=1).copy(deep=True)
dp_entry_regions = dp_entry_regions.explode("reg_position")
dp_entry_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,1
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,2
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,3
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,4
10082,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14314,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,382
14314,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,383
14314,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,384
14314,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,385


In [28]:

dp_entry_regions = pd.merge( left  = dp_entry_regions
                            , right = df_challenges
                            , how   = "left"
                            , on    = "term_id" )


dp_entry_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position,challenge
0,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,1,disorder
1,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,2,disorder
2,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,3,disorder
3,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,4,disorder
4,DP02342,P06837,Neuromodulin,10090,Mus musculus,MLCCMRRTKQVEKNDEDQKIEQDGVKPEDKAHKAATKIQASFRGHI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP02342r003,2022-02-14T09:00:00.000Z,1,227,IDPO:00078,pre-molten globule,Structural state,IDPO,5,disorder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141324,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,382,disorder
141325,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,383,disorder
141326,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,384,disorder
141327,DP03746,Q9QUH6-2,Isoform 2 of Ras/Rap GTPase-activating protein...,10116,Rattus norvegicus,MSYAPFRDVRGPPMHRTQYVHSPYDRPGWNPRFCIISGNQLLMLDE...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03746r001,2022-06-22T14:35:20.554Z,350,386,IDPO:00076,disorder,Structural state,IDPO,385,disorder


In [29]:
dp_entry_regions = dp_entry_regions[['disprot_id', 'acc', 'challenge', 'reg_position']]
dp_entry_regions.dropna(subset='challenge', inplace=True)
dp_entry_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)
dp_entry_regions

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_entry_regions.dropna(subset='challenge', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_entry_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)


Unnamed: 0,disprot_id,acc,challenge,reg_position
0,DP02342,P06837,disorder,1
1,DP02342,P06837,disorder,2
2,DP02342,P06837,disorder,3
3,DP02342,P06837,disorder,4
4,DP02342,P06837,disorder,5
...,...,...,...,...
141324,DP03746,Q9QUH6-2,disorder,382
141325,DP03746,Q9QUH6-2,disorder,383
141326,DP03746,Q9QUH6-2,disorder,384
141327,DP03746,Q9QUH6-2,disorder,385


### SIFTS dataframe

In [30]:
df_sifts = pd.read_csv(f"{dir_data_sifts}/uniprot_segments_observed.tsv.gz", sep="\t", header=1)
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,113l,A,P00720,1,162,1,162,1,162
1,11gs,A,P09211,3,210,2,209,3,210
2,11gs,B,P09211,3,210,2,209,3,210
3,121p,A,P01112,1,166,1,166,1,166
4,133l,A,P61626,1,130,1,130,19,148
...,...,...,...,...,...,...,...,...,...
962405,7z7o,C,B1PNC0,69,227,71,229,61,219
962406,7z7o,D,B1PNC0,13,67,13,67,3,57
962407,7z7o,D,B1PNC0,68,68,69,69,59,60
962408,7z7o,D,B1PNC0,68,68,69,69,58,58


#### Subsetting dataframe to have only present disprot sequences

In [31]:
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(dp_sequence['acc'].unique().tolist())]
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
1082,1h9e,A,P42166,1,56,1,56,2,57
1307,1ig8,A,P04807,18,486,18,486,18,486
1649,1k99,A,P17480,2,91,2,91,103,192
1960,1m7k,A,O95429,19,99,376,456,376,456
2382,1olt,A,P32131,4,13,4,13,4,13
...,...,...,...,...,...,...,...,...,...
961229,7vjy,A,P0DTC1,1,306,1,306,3264,3569
961312,7vk3,A,P0DTC1,1,300,1,300,3264,3563
961313,7vk3,B,P0DTC1,3,301,3,301,3266,3564
961328,7vk7,A,P0DTC1,1,300,1,300,3264,3563


In [32]:
df_sifts = df_sifts.apply(expand_region, database='sifts', axis=1)
df_sifts = df_sifts.explode("sp_position")
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
1082,1h9e,A,P42166,1,56,1,56,2,57,2
1082,1h9e,A,P42166,1,56,1,56,2,57,3
1082,1h9e,A,P42166,1,56,1,56,2,57,4
1082,1h9e,A,P42166,1,56,1,56,2,57,5
1082,1h9e,A,P42166,1,56,1,56,2,57,6
...,...,...,...,...,...,...,...,...,...,...
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3560
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3561
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3562
961329,7vk7,B,P0DTC1,3,301,3,301,3266,3564,3563


In [33]:
# Add wwPDB observed residues
# Suposition: aa is the same in uniprot and wwPDB
df_sequence_sifts = pd.merge( left     = dp_sequence
                            , right    = df_sifts
                            , left_on  = ['acc', 'seq_position']
                            , right_on = ['SP_PRIMARY', 'sp_position']
                            , how      = 'left' )
df_sequence_sifts

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
0,DP02342,P06837,1,M,,,,,,,,,,
1,DP02342,P06837,2,L,,,,,,,,,,
2,DP02342,P06837,3,C,,,,,,,,,,
3,DP02342,P06837,4,C,,,,,,,,,,
4,DP02342,P06837,5,M,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1394918,DP03746,Q9QUH6-2,1280,Q,,,,,,,,,,
1394919,DP03746,Q9QUH6-2,1281,L,,,,,,,,,,
1394920,DP03746,Q9QUH6-2,1282,L,,,,,,,,,,
1394921,DP03746,Q9QUH6-2,1283,I,,,,,,,,,,


In [34]:
df_sequence_sifts = df_sequence_sifts[['disprot_id', 'acc', 'seq_position', 'seq_aa', 'sp_position']]
df_sequence_sifts.drop_duplicates(inplace=True)
df_sequence_sifts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sequence_sifts.drop_duplicates(inplace=True)


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position
0,DP02342,P06837,1,M,
1,DP02342,P06837,2,L,
2,DP02342,P06837,3,C,
3,DP02342,P06837,4,C,
4,DP02342,P06837,5,M,
...,...,...,...,...,...
1394918,DP03746,Q9QUH6-2,1280,Q,
1394919,DP03746,Q9QUH6-2,1281,L,
1394920,DP03746,Q9QUH6-2,1282,L,
1394921,DP03746,Q9QUH6-2,1283,I,


### Challenges

In [35]:
for challenge in ['disorder', 'linker', 'transition', 'binding', 'nucleic acid binding', 'protein binding']:
    dp_entry_region_challenge = pd.merge( left     = dp_sequence
                                        , right    = dp_entry_regions.loc[dp_entry_regions['challenge'] == challenge]
                                        , left_on  = ["disprot_id", 'acc', 'seq_position']
                                        , right_on = ["disprot_id", 'acc', 'reg_position']
                                        , how      = 'left' )
    with open(f"{dir_results}/{challenge.replace(' ', '_')}.fasta", "w+") as f:
        for disprot_id, df_g in dp_entry_region_challenge.groupby(by='disprot_id'):
            f.write(">{}\n{}\n{}\n".format( disprot_id
                                          , ''.join(df_g['seq_aa'])
                                          , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['challenge']]) ))
    dp_sequence_sifts_challenge = pd.merge( right     = dp_entry_region_challenge
                                          , left    = df_sequence_sifts
                                          , right_on  = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'reg_position']
                                          , left_on = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'sp_position']
                                          , how      = 'left' )
    dp_sequence_sifts_challenge['result'] = '-'
    dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['sp_position'].notnull(), 'result'] = '0' 
    dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['challenge'].notnull(), 'result'] = '1'
    with open(f"{dir_results}/pdb-{challenge.replace(' ', '_')}-atleast.fasta", "w+") as f:
        for disprot_id, df_g in dp_sequence_sifts_challenge.groupby(by='disprot_id'):
            f.write(">{}\n{}\n{}\n".format( disprot_id
                                          , ''.join(df_g['seq_aa'])
                                          , ''.join(df_g['result'])))
with open(f"{dir_results}/pdb-atleast.fasta", "w+") as f:
    for disprot_id, df_g in df_sequence_sifts.groupby(by='disprot_id'):
        f.write(">{}\n{}\n{}\n".format( disprot_id
                                      , ''.join(df_g['seq_aa'])
                                      , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['sp_position']])))

#### Disprot + challenge

In [None]:
# challenge = "binding"
# dp_entry_region_challenge = pd.merge( left     = dp_sequence
#                                     , right    = dp_entry_regions.loc[dp_entry_regions['challenge'] == challenge]
#                                     , left_on  = ["disprot_id", 'acc', 'seq_position']
#                                     , right_on = ["disprot_id", 'acc', 'reg_position']
#                                     , how      = 'left' )
# dp_entry_region_challenge


In [None]:
# with open(f"{dir_results}/{challenge}.fasta", "w+") as f:
#     for disprot_id, df_g in dp_entry_region_challenge.groupby(by='disprot_id'):
#         f.write(">{}\n{}\n{}\n".format( disprot_id
#                                       , ''.join(df_g['seq_aa'])
#                                       , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['challenge']]) ))

#### DisProt + Sifts

In [None]:
# with open(f"{dir_results}/pdb-atleast.fasta", "w+") as f:
#     for disprot_id, df_g in df_sequence_sifts.groupby(by='disprot_id'):
#         f.write(">{}\n{}\n{}\n".format( disprot_id
#                                       , ''.join(df_g['seq_aa'])
#                                       , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['sp_position']])))

#### DisProt + Sifts + challenge

In [None]:
# challenge = "binding"
# dp_sequence_sifts_challenge = pd.merge( right     = dp_entry_region_challenge
#                                       , left    = df_sequence_sifts
#                                       , right_on  = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'reg_position']
#                                       , left_on = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'sp_position']
#                                       , how      = 'left' )
# dp_sequence_sifts_challenge

In [None]:
# dp_sequence_sifts_challenge['result'] = '-'
# dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['sp_position'].notnull(), 'result'] = '0' 
# dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['challenge'].notnull(), 'result'] = '1'
# dp_sequence_sifts_challenge

In [None]:
# with open(f"{dir_results}/pdb-binding-atleast.fasta", "w+") as f:
#     for disprot_id, df_g in dp_sequence_sifts_challenge.groupby(by='disprot_id'):
#         f.write(">{}\n{}\n{}\n".format( disprot_id
#                                       , ''.join(df_g['seq_aa'])
#                                       , ''.join(df_g['result'])))