# ENVs

**Comments**  
* Variables for directories always starts with '`dir_`' and ends without '`/`'
* Variables for dataframes always starts with '`df_`'

In [1]:
dir_pjs         = "/home/martingb/Projects"
# caid-reference
dir_main         = f"{dir_pjs}/2022/caid2-reference"
dir_data         = f"{dir_main}/data"
dir_data_sifts   = f"{dir_data}/sifts"
dir_data_disprot = f"{dir_data}/disprot"
dir_results      = f"{dir_main}/results"
dir_src          = f"{dir_main}/src"
dir_src_modules  = f"{dir_src}/modules"
dir_tmp          = f"{dir_main}/tmp"

## f-string variables

In [2]:
nl  = "\n"
tab = "\t"

# Imports

In [3]:
import os
import json
import requests
import pandas as pd
import numpy as np
from pprint import pprint

# Functions

# Retrieve release file

The following files should be placed in <root-to-CAID2>/data/annotations`

### Retrieve `disprot-2022_06-all.json`

## Retrieve annotations for CAID2

# Notes for CAID2

* Retrieve the latest DisProt release using the API as a json file
* Build a unify DataFrame using the `<disprot-release>.json": file


## Comments
- DisProt API has problems parsing:
    + Different versions
    + Regions for
        + Namespaces
    + Consensus

## DisProt cases to test

In [4]:
disprot_testing_cases = { "DP00005": "Antitermination protein N"
                        , "DP00009": "Transcription initiation factor IIA subunit 2"
                        , "DP00016": "Cyclin-dependent kinase inhibitor 1"
                        , "DP00040": "High mobility group protein HMG-I/HMG-Y"
                        , "DP00086": "Cellular tumor antigen p53" }

## GO and IDPO terms to be used

In [5]:
# TODO. Damiano has the GO terms
challenges = [ ('GO:0005488', 'binding')
             , ('GO:0005515', 'binding')
             , ('GO:1901363', 'binding')
             , ('GO:0005515', 'binding')
             , ('GO:0097159', 'binding')
             , ('GO:0003676', 'binding')
             , ('GO:0140666', 'binding')
             , ('GO:0003677', 'binding')
             , ('GO:0071667', 'binding')
             , ('GO:0003723', 'binding')
             , ('GO:0001067', 'binding')
             , ('GO:0090079', 'binding')
             , ('GO:0003676', 'nucleic acid binding')
             , ('GO:0140666', 'nucleic acid binding')
             , ('GO:0003677', 'nucleic acid binding')
             , ('GO:0071667', 'nucleic acid binding')
             , ('GO:0003723', 'nucleic acid binding')
             , ('GO:0001067', 'nucleic acid binding')
             , ('GO:0090079', 'nucleic acid binding')
             , ('GO:0005515', 'protein binding')
             , ('IDPO:00501', 'linker')
             , ('IDPO:00504', 'linker')
             , ('IDPO:00503', 'linker')
             , ('IDPO:00502', 'linker')
             , ('IDPO:00076', 'disorder')
             , ('IDPO:00077', 'disorder')
             , ('IDPO:00078', 'disorder')
             , ('IDPO:00049', 'transition')
             , ('IDPO:00050', 'transition')
             , ('IDPO:00051', 'transition')
             , ('IDPO:00052', 'transition')
             , ('IDPO:00053', 'transition')
             , ('IDPO:00060', 'transition')
             , ('IDPO:00055', 'transition')
             , ('IDPO:00056', 'transition')
             , ('IDPO:00061', 'transition')
             , ('IDPO:00054', 'transition')
             , ('IDPO:00057', 'transition')
             , ('IDPO:00058', 'transition')
             , ('IDPO:00059', 'transition')]


## CAID2 Dataframe from json

In [6]:
main_columns = [ 'disprot_id'
               , 'acc'
               , 'name'
               , 'ncbi_taxon_id'           # int
               , 'organism'
               , 'sequence'
               , 'taxonomy' ]              # list of str (organisms)

region_columns = [ "disprot_namespace"  # str
                 , "region_id"          # str
                 , "date"               # str
                 , "start"              # int
                 , "end"                # int
                 , "term_id"            # str (GO and IDPO terms)
                 , "term_name"          # str (GO and IDPO description)
                 , "term_namespace"     # str (GO and IDPO namespace)
                 , "term_ontology" ]    # str (type of ontology, e.i.: GO, IDPO)

# """
# Ontology
# DisProt relies on three different ontologies to annotate intrinsically disordered regions, the Intrinsically Disordered Proteins Ontology (IDPO), the Gene Ontology (GO) and the Evidence and Conclusion Ontology (ECO).
#     * IDPO is used to describe structural aspects of an IDP/IDR, self-functions and functions directly associated with their disordered state.
#     * GO is used to describe functional aspects of an IDP/IDR.
#     * ECO describes the technique or evidence associated with an annotation.
# """

### [NEW] - Parsing json to Dataframe

In [7]:
# To make a merge.
# Be aware that the column name for `term_id` is written correctly.
df_challenges = pd.DataFrame(data=challenges, columns=['term_id', 'challenge'])
df_challenges

Unnamed: 0,term_id,challenge
0,GO:0005488,binding
1,GO:0005515,binding
2,GO:1901363,binding
3,GO:0005515,binding
4,GO:0097159,binding
5,GO:0003676,binding
6,GO:0140666,binding
7,GO:0003677,binding
8,GO:0071667,binding
9,GO:0003723,binding


#### Old DisProt json

In [8]:
json_old = []
with open(f"{dir_data_disprot}/entries_2022_06.json", "r") as f:
    for line in f:
        json_old.append(json.loads(line))

In [9]:
multiple_level_data = pd.json_normalize( data          = json_old
                                       , record_path   = ['regions']
                                       , meta          = main_columns
                                       , meta_prefix   = ''
                                       , record_prefix = '' )
multiple_level_data = multiple_level_data.loc[:, main_columns + region_columns]
multiple_level_data

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",,DP00003r001,2018-01-31T14:00:00.000Z,174,179,00076,Disorder,Structural state,IDPO
1,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r002,2022-02-14T09:00:00.000Z,294,334,IDPO:00076,disorder,Structural state,IDPO
2,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Disorder function,DP00003r003,2021-02-16T10:44:57.192Z,294,334,00002,Flexible linker/spacer,Disorder function,IDPO
3,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r004,2022-02-14T09:00:00.000Z,454,464,IDPO:00076,disorder,Structural state,IDPO
4,DP00004,P49913,Cathelicidin antimicrobial peptide,9606,Homo sapiens,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00004r001,2022-02-14T09:00:00.000Z,134,170,IDPO:00076,disorder,Structural state,IDPO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13078,DP03728,Q6CSX2,Serine/threonine-protein kinase ATG1,284590,Kluyveromyces lactis (strain ATCC 8585 / CBS 2...,MSSESHDKVVAKAIRLPTENYSVEKEIGKGSFAVVYKGLSLRDGRN...,"[Eukaryota, Fungi, Dikarya, Ascomycota, Saccha...",Disorder function,DP03728r002,2022-06-07T21:12:35.877Z,562,831,GO:1990316,Atg1/ULK1 kinase complex,Cellular component,GO
13079,DP03729,Q8IYT8,Serine/threonine-protein kinase ULK2,9606,Homo sapiens,MEVVGDFEYSKRDLVGHGAFAVVFRGRHRQKTDWEVAIKSINKKNL...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP03729r001,2022-06-08T14:43:51.005Z,168,177,IDPO:00076,disorder,Structural state,IDPO
13080,DP03731,Q9UHK0,Nuclear fragile X mental retardation-interacti...,9606,Homo sapiens,MAEPTSDFETPIGWHASPELTPTLGPLSDTAPPRDSWMFWAMLPPP...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP03731r001,2022-06-13T12:37:58.500Z,486,495,IDPO:00076,disorder,Structural state,IDPO
13081,DP03731,Q9UHK0,Nuclear fragile X mental retardation-interacti...,9606,Homo sapiens,MAEPTSDFETPIGWHASPELTPTLGPLSDTAPPRDSWMFWAMLPPP...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP03731r002,2022-06-13T12:37:15.199Z,462,495,GO:0005515,protein binding,Molecular function,GO


In [10]:
set_disprot_ids_old = set(multiple_level_data["disprot_id"].unique().tolist())
len(set_disprot_ids_old)

2620

#### New DisProt json

In [11]:
json_new = []
with open(f"{dir_data_disprot}/entries_2022_06_c.json", "r") as f:
    for line in f:
        json_new.append(json.loads(line))
        #print(dict_all["size"])

In [12]:
multiple_level_data = pd.json_normalize( data          = json_new
                                       , record_path   = ['regions']
                                       , meta          = main_columns
                                       , meta_prefix   = ''
                                       , record_prefix = '' )
multiple_level_data = multiple_level_data.loc[:, main_columns + region_columns]
multiple_level_data

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
0,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",,DP00003r001,2018-01-31T14:00:00.000Z,174,179,00076,Disorder,Structural state,IDPO
1,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r002,2022-02-14T09:00:00.000Z,294,334,IDPO:00076,disorder,Structural state,IDPO
2,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Disorder function,DP00003r003,2021-02-16T10:44:57.192Z,294,334,00002,Flexible linker/spacer,Disorder function,IDPO
3,DP00003,P03265,DNA-binding protein,28285,Human adenovirus C serotype 5,MASREEEQRETTPERGRGAARRPPTMEDVSSPSPSPPPPRAPPKKR...,"[Viruses, Varidnaviria, Bamfordvirae, Preplasm...",Structural state,DP00003r004,2022-02-14T09:00:00.000Z,454,464,IDPO:00076,disorder,Structural state,IDPO
4,DP00004,P49913,Cathelicidin antimicrobial peptide,9606,Homo sapiens,MKTQRDGHSLGRWSLVLLLLGLVMPLAIIAQVLSYKEAVLRAIDGI...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00004r001,2022-02-14T09:00:00.000Z,134,170,IDPO:00076,disorder,Structural state,IDPO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14310,DP03744,Q9VVJ7,CG7484 protein,7227,Drosophila melanogaster,MHKCAIFLLLALSCQQIQAELTAADCRALGFIKAQLMCSSCEKLDD...,"Eukaryota, Metazoa, Ecdysozoa, Arthropoda, Hex...",Disorder function,DP03744r002,2022-06-17T10:48:39.874Z,53,178,GO:0045454,cell redox homeostasis,Biological process,GO
14311,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r001,2022-06-17T10:51:32.225Z,25,34,IDPO:00076,disorder,Structural state,IDPO
14312,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Structural state,DP03745r002,2022-06-17T10:51:44.007Z,121,145,IDPO:00076,disorder,Structural state,IDPO
14313,DP03745,Q8VHC3,Selenoprotein M,10090,Mus musculus,MSILLSPPSLLLLLAALVAPATSTTNYRPDWNRLRGLARGRVETCG...,"Eukaryota, Metazoa, Chordata, Craniata, Verteb...",Disorder function,DP03745r003,2022-06-17T10:52:51.196Z,24,145,GO:0045454,cell redox homeostasis,Biological process,GO


#### Getting private DisProt entries

In [13]:
set_disprot_ids_new = set(multiple_level_data["disprot_id"].unique().tolist())
len(set_disprot_ids_new)

2988

In [14]:
set_disprot_ids_private = set_disprot_ids_new - set_disprot_ids_old
len(set_disprot_ids_private)

368

In [15]:
'DP03753' in set_disprot_ids_private or 'DP03753' in multiple_level_data['disprot_id']

False

#### All the keys from the record `regions` in disprot json release 2022_06

##### Regions

In [16]:
# Filter testing examples
multiple_level_data = multiple_level_data.loc[multiple_level_data['disprot_id'].isin(disprot_testing_cases.keys())]
multiple_level_data


Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO
8,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural transition,DP00005r002,2021-06-07T12:37:21.927Z,1,107,IDPO:00050,disorder to order,Structural transition,IDPO
9,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Interaction partner,DP00005r003,2021-06-07T12:38:58.840Z,1,107,IDPO:00065,DNA binding,Interaction partner,IDPO
10,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r004,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO
11,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r005,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP00086r082,2022-02-14T09:00:00.000Z,38,61,GO:0005515,protein binding,Molecular function,GO
976,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP00086r083,2022-02-14T09:00:00.000Z,1,61,IDPO:00506,self-inhibition,Disorder function,IDPO
977,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Disorder function,DP00086r084,2022-02-14T09:00:00.000Z,1,61,IDPO:00506,self-inhibition,Disorder function,IDPO
978,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r085,2022-04-07T12:09:01.526Z,361,393,IDPO:00076,disorder,Structural state,IDPO


In [17]:
def expand_region(adf: pd.DataFrame, database: str = 'disprot') -> pd.DataFrame:
    """
    """
    if database == 'disprot':
        adf["reg_position"] = list(range(int(adf["start"]), int(adf["end"]) + 1, 1))
    elif database == 'sifts':
        adf["sp_position"] = list(range(int(adf["SP_BEG"]), int(adf["SP_END"]) + 1, 1))
    else:
        raise ValueError(f"'{database}' is not a choise for parameter 'database'")
    return adf

def expand_seq(adf: pd.DataFrame) -> pd.DataFrame:
    """
    """
    adf["seq_position_aa"] = [(i+1, aa) for i, aa in enumerate(adf["sequence"])]
    return adf


In [18]:
dp_sequence = multiple_level_data[['disprot_id', 'acc', 'sequence']].copy(deep=True).drop_duplicates()
dp_sequence['seq_position_aa'] = multiple_level_data['sequence'].apply(lambda x: [(i+1, aa) for i, aa in enumerate(x)])
dp_sequence = dp_sequence.explode("seq_position_aa")
dp_sequence[['seq_position', 'seq_aa']] = pd.DataFrame(dp_sequence['seq_position_aa'].tolist(), index=dp_sequence.index)
dp_sequence.drop(columns=['seq_position_aa', 'sequence'], inplace=True)
dp_sequence


Unnamed: 0,disprot_id,acc,seq_position,seq_aa
7,DP00005,P03045,1,M
7,DP00005,P03045,2,D
7,DP00005,P03045,3,A
7,DP00005,P03045,4,Q
7,DP00005,P03045,5,T
...,...,...,...,...
894,DP00086,P04637,389,G
894,DP00086,P04637,390,P
894,DP00086,P04637,391,D
894,DP00086,P04637,392,S


### Dataframe for regions

In [19]:
dp_entry_regions = multiple_level_data.apply(expand_region, database='disprot', axis=1).copy(deep=True)
dp_entry_regions = dp_entry_regions.explode("reg_position")
dp_entry_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,1
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,2
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,3
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,4
7,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,389
979,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,390
979,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,391
979,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,392


In [20]:

dp_entry_regions = pd.merge( left  = dp_entry_regions
                            , right = df_challenges
                            , how   = "left"
                            , on    = "term_id" )


dp_entry_regions

Unnamed: 0,disprot_id,acc,name,ncbi_taxon_id,organism,sequence,taxonomy,disprot_namespace,region_id,date,start,end,term_id,term_name,term_namespace,term_ontology,reg_position,challenge
0,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,1,disorder
1,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,2,disorder
2,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,3,disorder
3,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,4,disorder
4,DP00005,P03045,Antitermination protein N,10710,Escherichia phage lambda,MDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKS...,"[Viruses, Duplodnaviria, Heunggongvirae, Urovi...",Structural state,DP00005r001,2022-02-14T09:00:00.000Z,1,107,IDPO:00076,disorder,Structural state,IDPO,5,disorder
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16219,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,389,disorder
16220,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,390,disorder
16221,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,391,disorder
16222,DP00086,P04637,Cellular tumor antigen p53,9606,Homo sapiens,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,"[Eukaryota, Metazoa, Chordata, Craniata, Verte...",Structural state,DP00086r086,2022-02-14T09:00:00.000Z,361,393,IDPO:00076,disorder,Structural state,IDPO,392,disorder


In [21]:
dp_entry_regions = dp_entry_regions[['disprot_id', 'acc', 'challenge', 'reg_position']]
dp_entry_regions.dropna(subset='challenge', inplace=True)
dp_entry_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)
dp_entry_regions

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_entry_regions.dropna(subset='challenge', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dp_entry_regions.drop_duplicates(['disprot_id', 'acc', 'challenge', 'reg_position'], inplace=True)


Unnamed: 0,disprot_id,acc,challenge,reg_position
0,DP00005,P03045,disorder,1
1,DP00005,P03045,disorder,2
2,DP00005,P03045,disorder,3
3,DP00005,P03045,disorder,4
4,DP00005,P03045,disorder,5
...,...,...,...,...
15788,DP00086,P04637,nucleic acid binding,57
15790,DP00086,P04637,nucleic acid binding,58
15792,DP00086,P04637,nucleic acid binding,59
15794,DP00086,P04637,nucleic acid binding,60


### SIFTS dataframe

In [22]:
df_sifts = pd.read_csv(f"{dir_data_sifts}/uniprot_segments_observed.tsv.gz", sep="\t", header=1)
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,113l,A,P00720,1,162,1,162,1,162
1,11gs,A,P09211,3,210,2,209,3,210
2,11gs,B,P09211,3,210,2,209,3,210
3,121p,A,P01112,1,166,1,166,1,166
4,133l,A,P61626,1,130,1,130,19,148
...,...,...,...,...,...,...,...,...,...
962405,7z7o,C,B1PNC0,69,227,71,229,61,219
962406,7z7o,D,B1PNC0,13,67,13,67,3,57
962407,7z7o,D,B1PNC0,68,68,69,69,59,60
962408,7z7o,D,B1PNC0,68,68,69,69,58,58


#### Subsetting dataframe to have only present disprot sequences

In [23]:
df_sifts = df_sifts.loc[df_sifts['SP_PRIMARY'].isin(dp_sequence['acc'].unique().tolist())]
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
85,1aie,A,P04637,1,31,326,356,326,356
3448,1tup,A,P04637,1,196,94,289,94,289
3449,1tup,B,P04637,3,196,96,289,96,289
3450,1tup,C,P04637,2,196,95,289,95,289
5225,2bim,A,P04637,3,197,96,290,96,290
...,...,...,...,...,...,...,...,...,...
956126,7a4h,LO,P03045,37,65,37,65,35,62
956127,7a4h,LO,P03045,66,84,66,84,65,82
958615,2z5s,P,P04637,3,13,17,27,17,27
958616,2z5s,Q,P04637,3,13,17,27,17,27


In [24]:
df_sifts = df_sifts.apply(expand_region, database='sifts', axis=1)
df_sifts = df_sifts.explode("sp_position")
df_sifts

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
85,1aie,A,P04637,1,31,326,356,326,356,326
85,1aie,A,P04637,1,31,326,356,326,356,327
85,1aie,A,P04637,1,31,326,356,326,356,328
85,1aie,A,P04637,1,31,326,356,326,356,329
85,1aie,A,P04637,1,31,326,356,326,356,330
...,...,...,...,...,...,...,...,...,...,...
958617,2z5s,R,P04637,4,13,18,27,18,27,23
958617,2z5s,R,P04637,4,13,18,27,18,27,24
958617,2z5s,R,P04637,4,13,18,27,18,27,25
958617,2z5s,R,P04637,4,13,18,27,18,27,26


In [25]:
# Add wwPDB observed residues
# Suposition: aa is the same in uniprot and wwPDB
df_sequence_sifts = pd.merge( left     = dp_sequence
                            , right    = df_sifts
                            , left_on  = ['acc', 'seq_position']
                            , right_on = ['SP_PRIMARY', 'sp_position']
                            , how      = 'left' )
df_sequence_sifts

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END,sp_position
0,DP00005,P03045,1,M,6gov,N,P03045,4.0,110.0,1,107,1.0,107.0,1
1,DP00005,P03045,1,M,5lm7,F,P03045,6.0,86.0,1,81,1.0,81.0,1
2,DP00005,P03045,1,M,5lm7,N,P03045,6.0,87.0,1,82,1.0,82.0,1
3,DP00005,P03045,2,D,6gov,N,P03045,4.0,110.0,1,107,1.0,107.0,2
4,DP00005,P03045,2,D,5lm7,F,P03045,6.0,86.0,1,81,1.0,81.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117687,DP00086,P04637,393,D,6rl4,P,P04637,1.0,12.0,382,393,382.0,393.0,393
117688,DP00086,P04637,393,D,6rwi,P,P04637,1.0,12.0,382,393,382.0,393.0,393
117689,DP00086,P04637,393,D,6rwu,P,P04637,1.0,12.0,382,393,382.0,393.0,393
117690,DP00086,P04637,393,D,6sip,P,P04637,1.0,12.0,382,393,382.0,393.0,393


In [26]:
df_sequence_sifts = df_sequence_sifts[['disprot_id', 'acc', 'seq_position', 'seq_aa', 'sp_position']]
df_sequence_sifts.drop_duplicates(inplace=True)
df_sequence_sifts

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sequence_sifts.drop_duplicates(inplace=True)


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position
0,DP00005,P03045,1,M,1
3,DP00005,P03045,2,D,2
7,DP00005,P03045,3,A,3
12,DP00005,P03045,4,Q,4
17,DP00005,P03045,5,T,5
...,...,...,...,...,...
117567,DP00086,P04637,389,G,389
117595,DP00086,P04637,390,P,390
117623,DP00086,P04637,391,D,391
117650,DP00086,P04637,392,S,392


### Challenges

#### Disprot + challenge

In [27]:
challenge = "binding"
dp_entry_region_challenge = pd.merge( left     = dp_sequence
                                    , right    = dp_entry_regions.loc[dp_entry_regions['challenge'] == challenge]
                                    , left_on  = ["disprot_id", 'acc', 'seq_position']
                                    , right_on = ["disprot_id", 'acc', 'reg_position']
                                    , how      = 'left' )
dp_entry_region_challenge


Unnamed: 0,disprot_id,acc,seq_position,seq_aa,challenge,reg_position
0,DP00005,P03045,1,M,binding,1
1,DP00005,P03045,2,D,binding,2
2,DP00005,P03045,3,A,binding,3
3,DP00005,P03045,4,Q,binding,4
4,DP00005,P03045,5,T,binding,5
...,...,...,...,...,...,...
888,DP00086,P04637,389,G,binding,389
889,DP00086,P04637,390,P,,
890,DP00086,P04637,391,D,,
891,DP00086,P04637,392,S,,


In [29]:
with open(f"{dir_results}/{challenge}.fasta", "w+") as f:
    for disprot_id, df_g in dp_entry_region_challenge.groupby(by='disprot_id'):
        f.write(">{}\n{}\n{}\n".format( disprot_id
                                      , ''.join(df_g['seq_aa'])
                                      , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['challenge']]) ))

#### DisProt + Sifts

In [30]:
with open(f"{dir_results}/pdb-atleast.fasta", "w+") as f:
    for disprot_id, df_g in df_sequence_sifts.groupby(by='disprot_id'):
        f.write(">{}\n{}\n{}\n".format( disprot_id
                                      , ''.join(df_g['seq_aa'])
                                      , ''.join(['0' if pd.isnull(val) else '1' for val in df_g['sp_position']])))

#### DisProt + Sifts + challenge

In [31]:
challenge = "binding"
dp_sequence_sifts_challenge = pd.merge( right     = dp_entry_region_challenge
                                      , left    = df_sequence_sifts
                                      , right_on  = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'reg_position']
                                      , left_on = ["disprot_id", 'seq_aa', 'acc', 'seq_position', 'sp_position']
                                      , how      = 'left' )
dp_sequence_sifts_challenge

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position,challenge,reg_position
0,DP00005,P03045,1,M,1,binding,1
1,DP00005,P03045,2,D,2,binding,2
2,DP00005,P03045,3,A,3,binding,3
3,DP00005,P03045,4,Q,4,binding,4
4,DP00005,P03045,5,T,5,binding,5
...,...,...,...,...,...,...,...
888,DP00086,P04637,389,G,389,binding,389
889,DP00086,P04637,390,P,390,,
890,DP00086,P04637,391,D,391,,
891,DP00086,P04637,392,S,392,,


In [32]:
dp_sequence_sifts_challenge['result'] = '-'
dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['sp_position'].notnull(), 'result'] = '0' 
dp_sequence_sifts_challenge.loc[dp_sequence_sifts_challenge['challenge'].notnull(), 'result'] = '1'
dp_sequence_sifts_challenge

Unnamed: 0,disprot_id,acc,seq_position,seq_aa,sp_position,challenge,reg_position,result
0,DP00005,P03045,1,M,1,binding,1,1
1,DP00005,P03045,2,D,2,binding,2,1
2,DP00005,P03045,3,A,3,binding,3,1
3,DP00005,P03045,4,Q,4,binding,4,1
4,DP00005,P03045,5,T,5,binding,5,1
...,...,...,...,...,...,...,...,...
888,DP00086,P04637,389,G,389,binding,389,1
889,DP00086,P04637,390,P,390,,,0
890,DP00086,P04637,391,D,391,,,0
891,DP00086,P04637,392,S,392,,,0


In [34]:
with open(f"{dir_results}/pdb-binding-atleast.fasta", "w+") as f:
    for disprot_id, df_g in dp_sequence_sifts_challenge.groupby(by='disprot_id'):
        f.write(">{}\n{}\n{}\n".format( disprot_id
                                      , ''.join(df_g['seq_aa'])
                                      , ''.join(df_g['result'])))