In [1]:
#
# Import modules
#

import numpy as np
import os
import pandas as pd
import requests
import time

In [2]:
#
# Set user input variables
#

# Tab-separated PDM table to annotate; the annotated copy is written next to it
pdmTable_path = r'C:\Users\rbarreror\home\Proteomics\GroupTools\pdmAPPRIS\PDMTable_1_Heart_PDMTable1.txt'
# Name of the protein-accession column (read as df[qcol] when building the UniProt id list)
qcol = 'q'
# Genome assembly; replaced by the genome_assembly lookup when `organism` is a known key
ga = 'GRCm39'
# Organism key used for the genome_assembly lookup and in the APPRIS request URL
organism = 'mus_musculus'

In [3]:
#
# Set constants
#

# File name of the domain table stored under data/<genome assembly>/
domain_table = 'appris_method.feather'

# Genome assembly used for each supported organism
genome_assembly = dict(
    homo_sapiens='GRCh38',
    mus_musculus='GRCm39',
)

In [54]:
# Resolve the genome assembly that belongs to the selected organism
if organism in genome_assembly:
    ga = genome_assembly[organism]
else:
    # Unknown organism: `ga` keeps whatever value it already had
    print('** Organism not recognized or not available')

In [71]:
# Load the tab-separated PDM table (read_table uses '\t' as its default separator)
df = pd.read_table(pdmTable_path)

In [4]:
# Submit the UniProt ID-mapping job (UniProt accession -> Ensembl transcript)
# Read the accessions from the configured column; drop missing values and cast to
# str so that ','.join only ever sees strings.
ids = ','.join(df[qcol].dropna().drop_duplicates().astype(str).to_list())
fromdb = 'UniProtKB_AC-ID'
todb = 'Ensembl_Transcript'

url_map = 'https://rest.uniprot.org/idmapping/run'

# Form-encoded POST; the response of a successful submission carries the job id
r_url = requests.post(
    url_map,
    data={
        'from': fromdb,
        'to': todb,
        'ids': ids,
    },
)

if not r_url.ok:
    # Stop here: the polling cell reads r_url.json()["jobId"], which is only
    # present when the submission succeeded. `.text` is used because an error
    # body is not guaranteed to be JSON.
    raise RuntimeError(f'** ERROR: {r_url.text}')

print(f'** Mapping {fromdb} to {todb}')


** Mapping UniProtKB_AC-ID to Ensembl_Transcript


In [5]:
# Get Ensembl id
# Poll the stream endpoint until the mapping job returns results.
job_id = r_url.json()['jobId']  # loop-invariant, so read it once
url_get = f'https://rest.uniprot.org/idmapping/stream/{job_id}?compressed=false&format=json'

poll_seconds = 5
max_attempts = 120  # 120 x 5 s = 10 minutes before giving up

for _attempt in range(max_attempts):
    response = requests.get(url_get)

    if response.ok:
        # r_mapped is only ever bound to the parsed list of mapping records
        r_mapped = response.json()['results']
        break

    print('** Waiting map...')
    time.sleep(poll_seconds)
else:
    # The loop ended without a successful response: fail instead of waiting forever
    raise TimeoutError(
        f'** ERROR: UniProt mapping job {job_id} returned no results after '
        f'{max_attempts * poll_seconds} s (last response: {response.text})'
    )

** Waiting map...
** Waiting map...
** Waiting map...
** Waiting map...


In [7]:
# Build the UniProt -> Ensembl mapping table from the mapping records
uniprot_ensembl_pairs = [(record['from'], record['to']) for record in r_mapped]
mapdf = pd.DataFrame(uniprot_ensembl_pairs, columns=['from', 'to'])

In [8]:
# Load the domain table stored under data/<genome assembly>/
domain_table_path = os.path.join('data', ga, domain_table)
domdf = pd.read_feather(domain_table_path)

In [11]:
# Relation table q -> domain: attach every domain row of the mapped transcript,
# keeping mapped ids that have no domain row (left join)
q_dom_df = mapdf.merge(domdf, how='left', left_on='to', right_on='transcript_id')

In [50]:
# Dataframe containing pdm-q-pos (some pdm-q map to several positions)
pdm_qpos = df.loc[:, ['pdm', 'q']].copy()

# b/e hold one position or a ';'-separated list of positions. read_csv infers a
# numeric dtype when no row contains a list, so cast to str before splitting.
b_strings = df['b'].astype(str).to_list()
e_strings = df['e'].astype(str).to_list()

# One list of (begin, end) string pairs per row, then one row per pair
pdm_qpos['be'] = [
    list(zip(b.split(';'), e.split(';')))
    for b, e in zip(b_strings, e_strings)
]
pdm_qpos = pdm_qpos.explode('be')

# Per-row extraction also works on an empty table, where unzipping the whole
# column and indexing [0] would raise IndexError.
pdm_qpos['b'] = [pair[0] for pair in pdm_qpos['be']]
pdm_qpos['e'] = [pair[1] for pair in pdm_qpos['be']]
pdm_qpos = pdm_qpos.astype({'b': 'int64', 'e': 'int64'})

In [51]:
# Attach the domains of each protein to every pdm position row (left join on accession)
pdm_qpos_dom = pdm_qpos.merge(q_dom_df, how='left', left_on='q', right_on='from')

In [54]:
# Flag rows where the peptide [b, e] overlaps the domain [pep_start, pep_end].
# Two closed intervals overlap exactly when each starts no later than the other
# ends. Unlike testing whether b or e lies strictly inside the domain, this also
# catches a peptide that spans a whole domain and a peptide that shares only a
# boundary residue with it.
# Rows without a domain (NaN pep_start / pep_end) compare False.
# NOTE(review): assumes pep_start/pep_end are inclusive residue positions, as
# b/e are (IIIVGK, 6 residues, has b=99 and e=104) - confirm for the domain table.
pdm_qpos_dom['intersect'] = (
    (pdm_qpos_dom['b'] <= pdm_qpos_dom['pep_end'])
    & (pdm_qpos_dom['e'] >= pdm_qpos_dom['pep_start'])
)


In [57]:
# Pair each row's domain start and end into one (pep_start, pep_end) tuple
domain_starts = pdm_qpos_dom['pep_start'].to_list()
domain_ends = pdm_qpos_dom['pep_end'].to_list()
pdm_qpos_dom['be_domain'] = list(zip(domain_starts, domain_ends))

In [68]:
# For every (pdm, gene_id) pair, collect the distinct intersecting domains:
# hmm_name and be_domain become parallel lists.
domain_columns = ['pdm', 'gene_id', 'hmm_name', 'be_domain']
pdm_dom = (
    pdm_qpos_dom
    .loc[pdm_qpos_dom['intersect'], domain_columns]
    .drop_duplicates()
    .groupby(['pdm', 'gene_id'])
    .agg(list)
    .reset_index()
)

In [72]:
# Add the per-pdm domain columns to the PDM table (left join keeps every input row).
# NOTE: `df` is rebound to the merged table, so this cell is meant to run once per load.
df = df.merge(pdm_dom, how='left', on='pdm')

In [76]:
# Preview the output path: the input path with '_APPRIS' inserted before the extension
path_root, path_ext = os.path.splitext(pdmTable_path)
path_root + '_APPRIS' + path_ext

'C:\\Users\\rbarreror\\home\\Proteomics\\GroupTools\\pdmAPPRIS\\PDMTable_1_Heart_PDMTable1_APPRIS.txt'

In [77]:
# Write the annotated table next to the input file, inserting '_APPRIS' before
# the original extension.
path_root, path_ext = os.path.splitext(pdmTable_path)
outfile = path_root + '_APPRIS' + path_ext

# Write to `outfile` itself, so the file on disk is the path computed above
# rather than a second path rebuilt with a hard-coded '.txt' extension.
df.to_csv(outfile, sep='\t', index=False)

In [73]:
# Display the PDM table (display only; nothing is assigned)
df

Unnamed: 0,p,pdm,q,qFreq,pFreq,pd,d,Missing_Cleavage,Truncated,best_scan,...,qfFreq,A,M,L,N,qdNA,qNA,gene_id,hmm_name,be_domain
0,IIIVGK,IIIVGK_0.0002,P62342,1.0,1.0,IIIVGK:0.0002,0.000200,0.0,1.0,RH_Heart_TMTHF_FR5WO-98646-2,...,1.0,U,,,,P62342:0.0002::U,P62342::U,ENSMUSG00000075700,[Rdx],"[(42.0, 178.0)]"
1,SIAAEPIQGPYK,SIAAEPIQGPYK_0.0002,P28665,130.0,5.0,SIAAEPIQGPYK:0.0002,0.000200,0.0,1.0,RH_Heart_TMTHF_FR3WO-79252-3,...,5.0,U,,,,P28665:0.0002::U,P28665::U,ENSMUSG00000059908,[A2M_N],"[(129.0, 221.0)]"
2,AGGVEHQQIIDIAQK,AGGVEHQQIIDIAQK_0.0002,Q9CZ13,730.0,1.0,AGGVEHQQIIDIAQK:0.0002,0.000200,0.0,1.0,RH_Heart_TMTHF_FR5WO-75614-3,...,45.0,U,,,,Q9CZ13:0.0002::U,Q9CZ13::U,ENSMUSG00000025651,[Peptidase_M16_C],"[(211.0, 395.0)]"
3,ASYCIEHGDIEIAAK,ASYCIEHGDIEIAAK_0.0002,Q8CAQ8,485.0,1.0,ASYCIEHGDIEIAAK:0.0002,0.000200,0.0,1.0,RH_Heart_TMTHF_FR6WO-82593-3,...,1.0,U,,,,Q8CAQ8:0.0002::U,Q8CAQ8::U,ENSMUSG00000052337,"[Mitofilin, Mitofilin, Mitofilin]","[(43.0, 745.0), (43.0, 734.0), (43.0, 697.0)]"
4,AVEEQYSCEYGSGR,AVEEQYSCEYGSGR_0.0002,G5E902,415.0,12.0,AVEEQYSCEYGSGR:0.0002,0.000200,0.0,1.0,RH_Heart_TMTHF_FR3WO-49277-2,...,12.0,U,,,,G5E902:0.0002::U,G5E902::U,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31018,TYFPHFDVSHGSAQVK,TYFP[98.989064]HFDVSHGSAQVK,Q91VB8,3139.0,509.0,TYFPHFDVSHGSAQVK:98.989064,98.989064,0.0,0.0,RH_Heart_TMTHF_FR1WO-92752-3,...,533.0,F,6.0,11.0,47,Q91VB8:98.989064:47:F,Q91VB8:47:F,ENSMUSG00000069917,[Globin],"[(7.0, 107.0)]"
31019,TYFPHFDVSHGSAQVK,TYFP[98.989064]HFDVSHGSAQVK,Q91VB8,3139.0,509.0,TYFPHFDVSHGSAQVK:98.989064,98.989064,0.0,0.0,RH_Heart_TMTHF_FR1WO-92752-3,...,533.0,F,6.0,11.0,47,Q91VB8:98.989064:47:F,Q91VB8:47:F,ENSMUSG00000069919,[Globin],"[(7.0, 107.0)]"
31020,AAVDAGFVPNDMQVGQTGK,AAVDAGFVPNDMQ[98.989064]VGQTGK,Q99LC5,1241.0,406.0,AAVDAGFVPNDMQVGQTGK:98.989064,98.989064,0.0,0.0,RH_Heart_TMTHF_FR3WO-81696-3,...,406.0,Q,13.0,7.0,262,Q99LC5:98.989064:262:Q,Q99LC5:262:Q,ENSMUSG00000032314,[ETF_alpha],"[(211.0, 293.0)]"
31021,TYFPHFDVSHGSAQVK,TYFPHF[100.015501]DVSHGSAQVK,Q91VB8,3139.0,509.0,TYFPHFDVSHGSAQVK:100.015501,100.015501,0.0,0.0,RH_Heart_TMTHF_FR1WO-91683-3,...,533.0,F,6.0,11.0,47,Q91VB8:100.015501:47:F,Q91VB8:47:F,ENSMUSG00000069917,[Globin],"[(7.0, 107.0)]"


In [58]:
# Display the position x domain table, including the intersect flag (display only)
pdm_qpos_dom

Unnamed: 0,pdm,q,be,b,e,from,to,evalue,gene_id,hmm_name,pep_end,pep_start,transcript_id,intersect,be_domain
0,IIIVGK_0.0002,P62342,"(99, 104)",99,104,P62342,ENSMUST00000107924,6.800000e-16,ENSMUSG00000075700,Rdx,178.0,42.0,ENSMUST00000107924,True,"(42.0, 178.0)"
1,SIAAEPIQGPYK_0.0002,P28665,"(193, 204)",193,204,P28665,ENSMUST00000032228,1.700000e-18,ENSMUSG00000059908,A2M_N,221.0,129.0,ENSMUST00000032228,True,"(129.0, 221.0)"
2,SIAAEPIQGPYK_0.0002,P28665,"(193, 204)",193,204,P28665,ENSMUST00000032228,2.400000e-27,ENSMUSG00000059908,A2M_N_2,599.0,458.0,ENSMUST00000032228,False,"(458.0, 599.0)"
3,SIAAEPIQGPYK_0.0002,P28665,"(193, 204)",193,204,P28665,ENSMUST00000032228,3.400000e-28,ENSMUSG00000059908,A2M,830.0,740.0,ENSMUST00000032228,False,"(740.0, 830.0)"
4,SIAAEPIQGPYK_0.0002,P28665,"(193, 204)",193,204,P28665,ENSMUST00000032228,6.900000e-16,ENSMUSG00000059908,Thiol-ester_cl,992.0,965.0,ENSMUST00000032228,False,"(965.0, 992.0)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106121,TYFP[98.989064]HFDVSHGSAQVK,Q91VB8,"(42, 57)",42,57,Q91VB8,ENSMUST00000093209,4.500000e-29,ENSMUSG00000069919,Globin,107.0,7.0,ENSMUST00000093209,True,"(7.0, 107.0)"
106122,AAVDAGFVPNDMQ[98.989064]VGQTGK,Q99LC5,"(250, 268)",250,268,Q99LC5,ENSMUST00000034866,3.300000e-41,ENSMUSG00000032314,ETF,184.0,22.0,ENSMUST00000034866,False,"(22.0, 184.0)"
106123,AAVDAGFVPNDMQ[98.989064]VGQTGK,Q99LC5,"(250, 268)",250,268,Q99LC5,ENSMUST00000034866,8.400000e-38,ENSMUSG00000032314,ETF_alpha,293.0,211.0,ENSMUST00000034866,True,"(211.0, 293.0)"
106124,TYFPHF[100.015501]DVSHGSAQVK,Q91VB8,"(42, 57)",42,57,Q91VB8,ENSMUST00000093207,4.500000e-29,ENSMUSG00000069917,Globin,107.0,7.0,ENSMUST00000093207,True,"(7.0, 107.0)"


In [27]:
# Scratch check: for every row, pair each ';'-separated entry of b with the matching entry of e
[list(zip(*[i.split(';'),j.split(';')])) for i,j in zip(df['b'].to_list(), df['e'].to_list())]

[[('99', '104')],
 [('193', '204')],
 [('234', '248')],
 [('693', '707')],
 [('46', '59')],
 [('473', '485')],
 [('333', '347')],
 [('81', '95')],
 [('42', '55')],
 [('44', '54')],
 [('123', '138')],
 [('71', '81')],
 [('347', '362')],
 [('10', '21')],
 [('234', '248')],
 [('33', '48')],
 [('63', '69')],
 [('60', '73')],
 [('22', '32')],
 [('304', '314')],
 [('410', '422')],
 [('541', '553')],
 [('81', '96')],
 [('31', '41')],
 [('13', '24')],
 [('14', '42')],
 [('715', '720')],
 [('38', '51')],
 [('84', '97')],
 [('248', '264')],
 [('87', '96')],
 [('32', '48')],
 [('62', '73')],
 [('70', '80')],
 [('79', '94')],
 [('54', '65')],
 [('61', '73')],
 [('49', '62')],
 [('31', '48')],
 [('21', '31')],
 [('40', '50')],
 [('35', '43')],
 [('136', '145')],
 [('637', '649')],
 [('37', '51')],
 [('83', '97')],
 [('22', '31')],
 [('20', '32')],
 [('34', '45')],
 [('71', '86')],
 [('56', '67')],
 [('398', '415')],
 [('46', '65')],
 [('19', '32')],
 [('380', '390')],
 [('14', '26')],
 [('135', '14

In [21]:
# Scratch check: list the e entries that contain ';', i.e. hold more than one position
[i for i in df['e'].to_list() if ';' in i]

['322;421;520;586;718;949',
 '322;421;520;586;718;949',
 '18;133',
 '105;220',
 '3356;3484;4456',
 '245;344;443;476;509;608;641;839',
 '539;712;1048;1219;1392;1565',
 '223;454;487;619;817',
 '114;229',
 '481;490;499;508;553',
 '1949;2016;2083;2150',
 '355;652;850',
 '1074;1478',
 '741;785',
 '2904;3612;3866;3992;4584',
 '674;740;773;938',
 '1286;1412;1859;2262;2531;2659;2785;2913;3493;3621;3875;4001;4129;4593',
 '1816;2219',
 '628;654',
 '562;571;580',
 '322;421;520;586;718;949',
 '1538;1664;3365;4465',
 '774;1403;1850;2253;2522;2776;3738;4120',
 '774;1403;1850;2253;2522;2776;3738;4120',
 '105;220',
 '105;220',
 '674;740;773;938',
 '774;1403;1850;2253;2522;2776;3738;4120',
 '105;220',
 '105;220',
 '105;220',
 '1949;2016;2083;2150',
 '105;220',
 '223;454;487;619;817',
 '410;575;905',
 '18;133',
 '18;133',
 '18;133',
 '245;344;443;476;509;608;641;839',
 '1949;2016;2083;2150',
 '774;1403;1850;2253;2522;2776;3738;4120',
 '245;344;443;476;509;608;641;839',
 '539;712;1048;1219;1392;1565',
 '

In [12]:
# Display the accession -> transcript -> domain relation table (display only)
q_dom_df

Unnamed: 0,from,to,evalue,gene_id,hmm_name,pep_end,pep_start,transcript_id
0,P62342,ENSMUST00000107924,6.800000e-16,ENSMUSG00000075700,Rdx,178.0,42.0,ENSMUST00000107924
1,P28665,ENSMUST00000032228,1.700000e-18,ENSMUSG00000059908,A2M_N,221.0,129.0,ENSMUST00000032228
2,P28665,ENSMUST00000032228,2.400000e-27,ENSMUSG00000059908,A2M_N_2,599.0,458.0,ENSMUST00000032228
3,P28665,ENSMUST00000032228,3.400000e-28,ENSMUSG00000059908,A2M,830.0,740.0,ENSMUST00000032228
4,P28665,ENSMUST00000032228,6.900000e-16,ENSMUSG00000059908,Thiol-ester_cl,992.0,965.0,ENSMUST00000032228
...,...,...,...,...,...,...,...,...
8429,A0A0R4IZX1,ENSMUST00000002327,5.800000e-19,ENSMUSG00000002257,PH,312.0,217.0,ENSMUST00000002327
8430,P51860,ENSMUST00000121720,4.600000e-85,ENSMUSG00000082229,NAP,411.0,111.0,ENSMUST00000121720
8431,E9QK89,ENSMUST00000082337,1.300000e-11,ENSMUSG00000061607,FHA,123.0,55.0,ENSMUST00000082337
8432,E9QK89,ENSMUST00000082337,3.800000e-08,ENSMUSG00000061607,RTT107_BRCT_5,1587.0,1499.0,ENSMUST00000082337


In [13]:
# Display only the pdm, q, b and e columns of the PDM table
df.loc[:, ['pdm', 'q', 'b', 'e']]

Unnamed: 0,pdm,q,b,e
0,IIIVGK_0.0002,P62342,99,104
1,SIAAEPIQGPYK_0.0002,P28665,193,204
2,AGGVEHQQIIDIAQK_0.0002,Q9CZ13,234,248
3,ASYCIEHGDIEIAAK_0.0002,Q8CAQ8,693,707
4,AVEEQYSCEYGSGR_0.0002,G5E902,46,59
...,...,...,...,...
30043,TYFPHF[98.989064]DVSHGSAQVK,Q91VB8,42,57
30044,HSGDFGADAQGA[98.989064]MSK,P04247,120,134
30045,TYFP[98.989064]HFDVSHGSAQVK,Q91VB8,42,57
30046,AAVDAGFVPNDMQ[98.989064]VGQTGK,Q99LC5,250,268


In [8]:
# Display the domain table read from the feather file (display only)
domdf

Unnamed: 0,evalue,gene_id,hmm_name,pep_end,pep_start,transcript_id
0,3.6e-36,ENSMUSG00000042046,Pkinase_Tyr,897,652,ENSMUST00000045110
1,2.7e-27,ENSMUSG00000042046,Pkinase,820,653,ENSMUST00000188389
2,0.00000019,ENSMUSG00000070524,Ig_2,100,22,ENSMUST00000094337
3,0.000000000033,ENSMUSG00000070524,Ig_2,190,105,ENSMUST00000094337
4,4.4e-42,ENSMUSG00000048495,Cupin_8,253,17,ENSMUST00000162686
...,...,...,...,...,...,...
110001,8.8e-41,ENSMUSG00000064357,ATP-synt_A,222,19,ENSMUST00000082408
110002,8.6e-105,ENSMUSG00000064341,NADHdh,303,5,ENSMUST00000082392
110003,0.00000000000022,ENSMUSG00000064367,Proton_antipo_N,121,66,ENSMUST00000082418
110004,2.8e-74,ENSMUSG00000064367,Proton_antipo_M,418,134,ENSMUST00000082418


In [9]:
# Display the UniProt -> Ensembl mapping table (display only)
mapdf

Unnamed: 0,from,to
0,P62342,ENSMUSG00000075700
1,P28665,ENSMUSG00000059908
2,Q9CZ13,ENSMUSG00000025651
3,Q8CAQ8,ENSMUSG00000052337
4,G5E902,ENSMUSG00000061904
...,...,...
2721,A0A1D5RLV3,ENSMUSG00000055567
2722,A0A0R4IZX1,ENSMUSG00000002257
2723,P51860,ENSMUSG00000082229
2724,E9QK89,ENSMUSG00000061607


In [59]:
# Query the APPRIS exporter for the mapped Ensembl ids
toids = mapdf['to'].drop_duplicates().to_list()

# Request URL = prefix + Ensembl id + suffix
url_appris_pre = f'https://apprisws.bioinfo.cnio.es/rest/exporter/id/{organism}/'
url_appris_suf = f'?methods=spade&format=json&as={ga}&sc=ensembl&ds=e105v46'

# Only the first 10 ids are requested
r_appris = [
    requests.get(f'{url_appris_pre}{ensembl_id}{url_appris_suf}')
    for ensembl_id in toids[:10]
]

In [94]:
# Scratch check: json.loads turns a JSON object string into a dict
import json
json.loads('{"a":"aa"}')

{'a': 'aa'}

In [98]:
# Flatten the APPRIS responses into tuples of
# (gene_id, transcript_id, <one value per note field>).
# The result is only displayed, not stored.
[
    m for m in 
    [
        # one tuple per record: both ids, then the second ':'-separated piece of every ','-separated note field
        (k['gene_id'], k['transcript_id'], *[l.split(':')[1] for l in k['note'].split(',')]) for j in
        [
            i.json() for i in r_appris if i.ok # get response from appris
        ]
        # skip records that lack any of the three keys read above
        for k in j if all(np.isin(['gene_id', 'transcript_id', 'note'], list(k.keys())))
    ] if len(m)==6  # keep only tuples with exactly four note values after the two ids
]

[('ENSMUSG00000075700', 'ENSMUST00000107924', 'Rdx', '6.8e-16', '42', '178'),
 ('ENSMUSG00000059908',
  'ENSMUST00000032228',
  'A2M_N',
  '1.7e-18',
  '129',
  '221'),
 ('ENSMUSG00000059908',
  'ENSMUST00000032228',
  'A2M_N_2',
  '2.4e-27',
  '458',
  '599'),
 ('ENSMUSG00000059908', 'ENSMUST00000032228', 'A2M', '3.4e-28', '740', '830'),
 ('ENSMUSG00000059908',
  'ENSMUST00000032228',
  'Thiol-ester_cl',
  '6.9e-16',
  '965',
  '992'),
 ('ENSMUSG00000059908',
  'ENSMUST00000032228',
  'A2M_comp',
  '2.3e-91',
  '1013',
  '1267'),
 ('ENSMUSG00000059908',
  'ENSMUST00000032228',
  'A2M_recep',
  '2.3e-26',
  '1378',
  '1465'),
 ('ENSMUSG00000025651',
  'ENSMUST00000194047',
  'Peptidase_M16',
  '4.2e-53',
  '41',
  '187'),
 ('ENSMUSG00000025651',
  'ENSMUST00000026743',
  'Peptidase_M16',
  '4.2e-53',
  '58',
  '204'),
 ('ENSMUSG00000025651',
  'ENSMUST00000026743',
  'Peptidase_M16_C',
  '9e-32',
  '211',
  '395'),
 ('ENSMUSG00000025651',
  'ENSMUST00000194469',
  'Peptidase_M16_C',
  

In [None]:
# Scratch check: comma-separated string of the distinct values in the configured accession column
','.join(df[qcol].drop_duplicates().to_list())

In [27]:
# Scratch request: submit the accessions to the mapping service with 'Ensembl' as target
r = requests.post(
    'https://rest.uniprot.org/idmapping/run',
    {
        'from': 'UniProtKB_AC-ID',
        'to': 'Ensembl',
        'ids': ','.join(df[qcol].drop_duplicates().to_list()),
    },
)

In [28]:
# Show the job id contained in the response to the scratch request
r.json()['jobId']

'c66f27cd4439339158c95f5cb3945e6bdf75ffc9'

In [29]:
# Scratch request: ask the stream endpoint for the results of the scratch job
scratch_job_id = r.json()["jobId"]
r2 = requests.get(
    f'https://rest.uniprot.org/idmapping/stream/{scratch_job_id}?compressed=false&format=json'
)

In [34]:
# Check whether the stream request succeeded
r2.ok

False

In [30]:
# Inspect the JSON body of the stream response
r2.json()

{'url': 'http://rest.uniprot.org/idmapping/stream/c66f27cd4439339158c95f5cb3945e6bdf75ffc9',
 'messages': ['Resource not found']}

In [8]:
# List the distinct values of the q column
df['q'].drop_duplicates().to_list()

['P62342',
 'P28665',
 'Q9CZ13',
 'Q8CAQ8',
 'G5E902',
 'P38647',
 'Q00898',
 'P04247',
 'Q91VB8',
 'Q8BH95',
 'Q99KI0',
 'Q9WUM5',
 'P23242',
 'Q9QY93',
 'Q8BW75',
 'P19536',
 'P10649',
 'A2A439',
 'A0A0R4J0I1',
 'P56480',
 'P23953',
 'Q8QZS1',
 'Q9CRB9',
 'A2AL85',
 'Q3TA68',
 'P12787',
 'A0A0R4J0X7',
 'A8DUK4',
 'Q03265',
 'Q9ERI2',
 'O35943',
 'Q9CXW2',
 'Q5SUC9',
 'P70404',
 'O35459',
 'Q8K370',
 'Q9DCB8',
 'E9QPD7',
 'P07310',
 'A6ZI44',
 'Q8CC88',
 'Q9DCM0',
 'Q60936',
 'P99024',
 'Q8BIJ6',
 'Q80Y14',
 'Q9JHI5',
 'Q9D0S9',
 'Q6P1F6',
 'P47857',
 'O88492',
 'P17182',
 'P48962',
 'P08249',
 'Q9D1A2',
 'Q9CQZ5',
 'P08228',
 'P21614',
 'Q99JY0',
 'Q91ZA3',
 'P07724',
 'P20152',
 'P19096',
 'O35129',
 'P45952',
 'O35639',
 'P60710',
 'Q9JI91',
 'Q9D020',
 'P70670',
 'Q9Z0V7',
 'J3QMG3',
 'Q8K411',
 'P62141',
 'Q8CG76',
 'P70398',
 'Q9CPV4',
 'Q9WUB3',
 'A0A0R4J083',
 'Q8K2B3',
 'P07309',
 'Q9D328',
 'Q9DCW4',
 'Q60932',
 'P48787',
 'O35887',
 'Q8R4N0',
 'P35564',
 'P63017',
 'A2AKU9'