### Uniprot KG Data Builder

In [1]:
import gzip
import pandas as pd
import json as json
import os


#### Read the JSON file (the whole file is loaded at once)

In [3]:
# Load every UniProt entry from the JSON export into memory.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's locale default encoding.
with open("data/output/all_uniprot_data.json", "r", encoding="utf-8") as f:
    uniprot_data = json.load(f)

- How many data points are there? What are the keys?

In [13]:
# Report the number of loaded entries and the field names of the first entry.
print(f"Data Points:  {len(uniprot_data)} | Keys: {uniprot_data[0].keys()}")

Data Points:  569516 | Keys: dict_keys(['accession', 'name', 'gene', 'organism', 'sequence', 'uniprotId', 'ptm', 'references'])


### Prepare subset of data with PTM record

First we select the entries that have PTM annotations and flatten each PTM dictionary into three separate columns: position, description, and evidence.

In [24]:
# Flatten the UniProt entries into one record per (PTM, supporting reference)
# pair. A PTM with no 'Evidence' string, or whose evidence keys match none of
# the entry's references, contributes no record.
PROTEIN_FIELDS = ('gene', 'name', 'organism', 'uniprotId')
REFERENCE_FIELDS = ('pubmedId', 'journal', 'title', 'date')

REF_DATA = []
for item in uniprot_data:
    # Skip entries without PTM annotations (missing key, None or empty list).
    if not item.get('ptm'):
        continue

    # Protein-level fields shared by every record derived from this entry.
    data = {field: item[field] for field in PROTEIN_FIELDS}

    # create key to reference mapping
    key2ref = {ref['key']: ref for ref in (item.get('references') or [])}

    for ptm in item['ptm']:
        idata = data.copy()
        # .get() yields None for an absent field without hiding unrelated
        # errors the way a bare `except:` would.
        idata['position'] = ptm.get('Position')
        idata['description'] = ptm.get('Description')
        idata['evidence'] = ptm.get('Evidence')

        if idata['evidence'] is None:
            continue

        # The evidence string is a whitespace-separated list of reference keys;
        # keys with no matching reference are ignored.
        for ref_key in idata['evidence'].split():
            if ref_key not in key2ref:
                continue
            ref_item = key2ref[ref_key]

            idata_ref = idata.copy()
            # Bibliographic fields are only copied for references that carry a
            # pubmedId; otherwise all four are None. .get() tolerates a
            # reference that has a pubmedId but lacks one of the other fields.
            has_pubmed = 'pubmedId' in ref_item
            for field in REFERENCE_FIELDS:
                idata_ref[field] = ref_item.get(field) if has_pubmed else None

            REF_DATA.append(idata_ref)

In [25]:
# Preview only the first few records; REF_DATA holds one record per
# PTM/reference pair, so displaying the whole list would flood the notebook output.
REF_DATA[:3]

[{'gene': 'scly',
  'name': 'Selenocysteine lyase',
  'organism': 'Xenopus tropicalis',
  'uniprotId': 'Q5U4Q9',
  'position': '239',
  'description': 'N6-(pyridoxal phosphate)lysine',
  'evidence': '1',
  'pubmedId': None,
  'journal': None,
  'title': None,
  'date': None},
 {'gene': 'ACTG1',
  'name': 'Actin, cytoplasmic 2',
  'organism': 'Homo sapiens',
  'uniprotId': 'P63261',
  'position': '1',
  'description': 'N-acetylmethionine',
  'evidence': '17 25 26',
  'pubmedId': '25944712',
  'journal': 'Proteomics',
  'title': None,
  'date': '2015'},
 {'gene': 'ACTG1',
  'name': 'Actin, cytoplasmic 2',
  'organism': 'Homo sapiens',
  'uniprotId': 'P63261',
  'position': '1',
  'description': 'N-acetylmethionine',
  'evidence': '17 25 26',
  'pubmedId': '16773128',
  'journal': 'Eur. J. Hum. Genet.',
  'title': None,
  'date': '2006'},
 {'gene': 'ACTG1',
  'name': 'Actin, cytoplasmic 2',
  'organism': 'Homo sapiens',
  'uniprotId': 'P63261',
  'position': '1',
  'description': 'N-acety

In [26]:
# Persist the flattened PTM/reference records as JSON.
with open('data/output/REF_DATA.json', 'w') as out_file:
    json.dump(REF_DATA, out_file)

#with open('output/REF_DATA.json', 'r') as f:
#     PTM_DATA = json.load(f)

- How many PTM records were found? What percentage of the data do they represent?

In [27]:
# Turn the list of record dicts into a table, one row per record.
df_modres_ref = pd.DataFrame(data=REF_DATA)

In [28]:
# Preview the first 10 rows of the flattened table.
df_modres_ref.head(n=10)

Unnamed: 0,gene,name,organism,uniprotId,position,description,evidence,pubmedId,journal,title,date
0,scly,Selenocysteine lyase,Xenopus tropicalis,Q5U4Q9,239,N6-(pyridoxal phosphate)lysine,1,,,,
1,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,25944712.0,Proteomics,,2015.0
2,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,16773128.0,Eur. J. Hum. Genet.,,2006.0
3,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,18804074.0,J. Genet. Genomics,,2008.0
4,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,22223895.0,Mol. Cell. Proteomics,,2012.0
5,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,22814378.0,Proc. Natl. Acad. Sci. U.S.A.,,2012.0
6,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,26228148.0,Science,,2015.0
7,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,30028079.0,FEBS J.,,2018.0
8,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,14684684.0,J. Med. Genet.,,2003.0
9,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,16773128.0,Eur. J. Hum. Genet.,,2006.0


#### Create PTM Identifier from description:
We want to create an identifier from the description column that matches the identifiers in the PTM vocabulary file.


In [29]:
# Derive the lookup identifier from the description: keep the text before the
# first ';' and lower-case it. The vectorised .str accessor propagates a
# missing description as NaN, whereas a per-row lambda raises TypeError on
# `";" in None`.
df_modres_ref['Identifier'] = (
    df_modres_ref['description']
    .str.split(';', n=1)
    .str[0]
    .str.lower()
)

In [31]:
# Load the identifier -> PTM accession lookup table.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's locale default encoding.
with open('data/ptm/idf2accession.json', 'r', encoding='utf-8') as f:
    idf2accession = json.load(f)

# Identifiers that are absent from the lookup table map to None
# (dict.get's default), matching the previous membership-test lambda.
df_modres_ref['ptm_accession'] = df_modres_ref['Identifier'].map(idf2accession.get)

In [32]:
# Preview the first 2 rows, now including the Identifier and ptm_accession columns.
df_modres_ref.head(n=2)

Unnamed: 0,gene,name,organism,uniprotId,position,description,evidence,pubmedId,journal,title,date,Identifier,ptm_accession
0,scly,Selenocysteine lyase,Xenopus tropicalis,Q5U4Q9,239,N6-(pyridoxal phosphate)lysine,1,,,,,n6-(pyridoxal phosphate)lysine,
1,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,25944712.0,Proteomics,,2015.0,n-acetylmethionine,PTM-0205


In [33]:
# Keep only rows that have both a mapped PTM accession and a PubMed ID.
df_modres_ref = df_modres_ref.dropna(subset=['ptm_accession', 'pubmedId'], how='any')

In [35]:
# Show the column labels of the filtered table.
df_modres_ref.columns

Index(['gene', 'name', 'organism', 'uniprotId', 'position', 'description',
       'evidence', 'pubmedId', 'journal', 'title', 'date', 'Identifier',
       'ptm_accession'],
      dtype='object')

In [36]:
# Show (rows, columns) of the table after dropping rows without accession or PubMed ID.
df_modres_ref.shape

(161607, 13)

In [37]:
# Build a modified-residue ID of the form <uniprotId>_<ptm_accession>_<position>.
df_modres_ref['mod_res_id'] = (
    df_modres_ref['uniprotId']
    + "_"
    + df_modres_ref['ptm_accession']
    + "_"
    + df_modres_ref['position'].astype(str)
)

In [38]:
# Preview the first 20 rows, now including the mod_res_id column.
df_modres_ref.head(n=20)

Unnamed: 0,gene,name,organism,uniprotId,position,description,evidence,pubmedId,journal,title,date,Identifier,ptm_accession,mod_res_id
1,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,25944712,Proteomics,,2015,n-acetylmethionine,PTM-0205,P63261_PTM-0205_1
2,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,16773128,Eur. J. Hum. Genet.,,2006,n-acetylmethionine,PTM-0205,P63261_PTM-0205_1
3,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,1,N-acetylmethionine,17 25 26,18804074,J. Genet. Genomics,,2008,n-acetylmethionine,PTM-0205,P63261_PTM-0205_1
4,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,22223895,Mol. Cell. Proteomics,,2012,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
5,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,22814378,Proc. Natl. Acad. Sci. U.S.A.,,2012,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
6,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,26228148,Science,,2015,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
7,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,30028079,FEBS J.,,2018,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
8,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,14684684,J. Med. Genet.,,2003,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
9,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,16773128,Eur. J. Hum. Genet.,,2006,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2
10,ACTG1,"Actin, cytoplasmic 2",Homo sapiens,P63261,2,"N-acetylglutamate; in Actin, cytoplasmic 2, N-...",14 15 18 19 24 25 26 27,18804074,J. Genet. Genomics,,2008,n-acetylglutamate,PTM-0202,P63261_PTM-0202_2


In [23]:
# save the dataframe as csv file
#df_modres_ref.to_csv('kgdata/df_modres_ref.csv', index=False)

In [39]:
# Write the table to disk as JSON in the 'records' orientation.
df_modres_ref.to_json(path_or_buf='data/kgdata/df_modres_ref.json', orient='records')

In [40]:
# Show the column labels of the table that was just saved.
df_modres_ref.columns

Index(['gene', 'name', 'organism', 'uniprotId', 'position', 'description',
       'evidence', 'pubmedId', 'journal', 'title', 'date', 'Identifier',
       'ptm_accession', 'mod_res_id'],
      dtype='object')

# Rough

In [85]:
# explode the dictionaries into separate rows
#df_test = pd.read_csv('output/ptm_data.csv')
#df_test = df_test.explode('ptm')
#df_test.head(5)

In [86]:
# convert the dictionaries into separate columns
#df_test = pd.read_csv('output/ptm_data.csv')
#df_test[['Position', 'Description', 'Evidence']] = pd.json_normalize(df_test['ptm'])