### Uniprot KG Data Builder

In [3]:
import gzip
import pandas as pd
import json as json
import os


#### Read Json file (one file element at a time)

In [4]:
with open("data/output/all_uniprot_data.json", "r") as f:
    uniprot_data = json.load(f) 

- How amny data points are there? What are the keys?

In [5]:
print("Data Points: ",len(uniprot_data),"|",  "Keys:" , uniprot_data[0].keys())

Data Points:  569516 | Keys: dict_keys(['accession', 'name', 'gene', 'organism', 'sequence', 'uniprotId', 'ptm', 'references'])


- Lets print single data that includes PTM info- 

In [6]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(uniprot_data[24])

{   'accession': [   'P63261',
                     'A8K7C2',
                     'P02571',
                     'P14104',
                     'P99022',
                     'Q5U032',
                     'Q96E67'],
    'gene': 'ACTG1',
    'name': 'Actin, cytoplasmic 2',
    'organism': 'Homo sapiens',
    'ptm': [   {   'Description': 'N-acetylmethionine',
                   'Evidence': '17 25 26',
                   'Position': '1'},
               {   'Description': 'N-acetylglutamate; in Actin, cytoplasmic 2, '
                                  'N-terminally processed; partial',
                   'Evidence': '14 15 18 19 24 25 26 27',
                   'Position': '2'},
               {   'Description': 'Methionine (R)-sulfoxide',
                   'Evidence': '1',
                   'Position': '44'},
               {   'Description': 'Methionine (R)-sulfoxide',
                   'Evidence': '1',
                   'Position': '47'},
               {   'Description': 'Tele-

- What are other PTM including data?

In [7]:
for i,item in enumerate(uniprot_data):
    if item['ptm'] != []:
        print(item['ptm'])
        
    if i >50:
        break   


[{'Position': '239', 'Description': 'N6-(pyridoxal phosphate)lysine', 'Evidence': '1'}]
[{'Position': '1', 'Description': 'N-acetylmethionine', 'Evidence': '17 25 26'}, {'Position': '2', 'Description': 'N-acetylglutamate; in Actin, cytoplasmic 2, N-terminally processed; partial', 'Evidence': '14 15 18 19 24 25 26 27'}, {'Position': '44', 'Description': 'Methionine (R)-sulfoxide', 'Evidence': '1'}, {'Position': '47', 'Description': 'Methionine (R)-sulfoxide', 'Evidence': '1'}, {'Position': '73', 'Description': 'Tele-methylhistidine', 'Evidence': '16 19'}, {'Position': '84', 'Description': 'N6-methyllysine', 'Evidence': '11'}]
[{'Position': '49', 'Description': 'N-acetylvaline', 'Evidence': '12'}]
[{'Position': '328', 'Description': 'Phosphoserine', 'Evidence': '4'}]
[{'Position': '235', 'Description': 'N5-methylglutamine', 'Evidence': '1'}]
[{'Position': '2', 'Description': 'N-acetylserine', 'Evidence': '2'}, {'Position': '2', 'Description': 'Phosphoserine', 'Evidence': '2'}, {'Position

### Prepare subset of data with PTM record

First we select the PTM associated data and flatten the dictionary to create three columns seperately: Position, description, evicence

In [8]:
PTM_DATA = []
for item in uniprot_data:
    if item['ptm'] != []:
        data = {}
        data['accession'] = item['accession']
        data['gene'] = item['gene']
        data['name'] = item['name']
        data['organism'] = item['organism']
        data['sequence'] = item['sequence']
        data['uniprotId'] = item['uniprotId']
        

        idata = data.copy()
        
        for ptm in item['ptm']:
            try:
                idata['position'] = ptm['Position']
            except:
                idata['position'] = None
            try:
                idata['description'] = ptm['Description']
            except:
                idata['description'] = None
            try:
                idata['evidence'] = ptm['Evidence'].split(' ')
            except:
                idata['evidence'] = None

            PTM_DATA.append(idata.copy())

In [9]:
# with open('output/PTM_DATA.json', 'w') as f:
#     json.dump(PTM_DATA,f)

#with open('output/PTM_DATA.json', 'r') as f:
#     PTM_DATA = json.load(f)

- How many PTM data are found? What is the PTM data %?

In [10]:
len(PTM_DATA), (len(PTM_DATA)/len(uniprot_data))*100

(260688, 45.773604253436254)

In [11]:
df_ptm_flat = pd.DataFrame(PTM_DATA)

In [12]:
df_ptm_flat.head(2)

Unnamed: 0,accession,gene,name,organism,sequence,uniprotId,position,description,evidence
0,[Q5U4Q9],scly,Selenocysteine lyase,Xenopus tropicalis,MADAESQNGENHLPHKIYLDYNATTPPATEVVKAVEEALREAWGNP...,Q5U4Q9,239,N6-(pyridoxal phosphate)lysine,[1]
1,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,1,N-acetylmethionine,"[17, 25, 26]"


#### Create PTM Identifier from description:
WE want to create identifier from description column to match identifier from PTM vocab file


In [13]:
df_ptm_flat['Identifier'] = df_ptm_flat['description']\
        .apply(lambda x: x.split(';')[0].lower() if ";" in x else x.lower())

In [14]:
with open('data/ptm/idf2accession.json', 'r') as f:
    idf2accession = json.load(f)

df_ptm_flat['ptm_accession'] = df_ptm_flat['Identifier']\
        .apply(lambda x: idf2accession[x] if x in idf2accession else None)

In [15]:
df_ptm_flat.head(2)

Unnamed: 0,accession,gene,name,organism,sequence,uniprotId,position,description,evidence,Identifier,ptm_accession
0,[Q5U4Q9],scly,Selenocysteine lyase,Xenopus tropicalis,MADAESQNGENHLPHKIYLDYNATTPPATEVVKAVEEALREAWGNP...,Q5U4Q9,239,N6-(pyridoxal phosphate)lysine,[1],n6-(pyridoxal phosphate)lysine,
1,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,1,N-acetylmethionine,"[17, 25, 26]",n-acetylmethionine,PTM-0205


In [16]:
df_ptm_flat.columns

Index(['accession', 'gene', 'name', 'organism', 'sequence', 'uniprotId',
       'position', 'description', 'evidence', 'Identifier', 'ptm_accession'],
      dtype='object')

In [17]:
#260688
df_ptm_flat_dropna = df_ptm_flat.dropna(subset=['ptm_accession'])
len(df_ptm_flat_dropna), (len(df_ptm_flat_dropna)/len(df_ptm_flat))*100

(234981, 90.13878659547045)

In [18]:
df_ptm_flat_dropna['mod_res_id'] = df_ptm_flat_dropna['uniprotId'] \
        + "_" + df_ptm_flat_dropna['ptm_accession'] + "_" + df_ptm_flat_dropna['position'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ptm_flat_dropna['mod_res_id'] = df_ptm_flat_dropna['uniprotId'] \


In [19]:
df_ptm_flat_dropna.head(5)

Unnamed: 0,accession,gene,name,organism,sequence,uniprotId,position,description,evidence,Identifier,ptm_accession,mod_res_id
1,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,1,N-acetylmethionine,"[17, 25, 26]",n-acetylmethionine,PTM-0205,P63261_PTM-0205_1
2,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...","[14, 15, 18, 19, 24, 25, 26, 27]",n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
5,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,73,Tele-methylhistidine,"[16, 19]",tele-methylhistidine,PTM-0290,P63261_PTM-0290_73
6,"[P63261, A8K7C2, P02571, P14104, P99022, Q5U03...",ACTG1,"Actin, cytoplasmic 2",Homo sapiens,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,P63261,84,N6-methyllysine,[11],n6-methyllysine,PTM-0194,P63261_PTM-0194_84
7,"[Q38933, A8MR53, Q39145]",LCY1,"Lycopene beta cyclase, chloroplastic",Arabidopsis thaliana,MDTLLKTPNKLDFFIPQFHGFERLCSNNPYHSRVRLGVKKRAIKIV...,Q38933,49,N-acetylvaline,[12],n-acetylvaline,PTM-0210,Q38933_PTM-0210_49


In [20]:
# save the dataframe as csv file
#df_ptm_flat_dropna.to_csv('kgdata/ptm_flat_dropna.csv', index=False)

In [21]:
# save the dataframe as json file
df_ptm_flat_dropna.to_json('data/kgdata/ptm_flat_dropna.json', orient='records')

In [22]:
df_ptm_flat_dropna.columns

Index(['accession', 'gene', 'name', 'organism', 'sequence', 'uniprotId',
       'position', 'description', 'evidence', 'Identifier', 'ptm_accession',
       'mod_res_id'],
      dtype='object')

# Rough

In [23]:
# explode the dictionaries into separate rows
#df_test = pd.read_csv('output/ptm_data.csv')
#df_test = df_test.explode('ptm')
#df_test.head(5)

In [24]:
#convert the dictionaries into separate columns
#df_test = pd.read_csv('output/ptm_data.csv')
#df_test[['Position', 'Description', 'Evidence']] = pd.json_normalize(df_test['ptm'])