### Uniprot PTM Vocabulary Parser

In [39]:
import gzip
import pandas as pd
import json as json
import os

### Prepare PTM Vocabulary

            ---------  ---------------------------     ------------------------------
            Line code  Content                         Occurrence in an entry
            ---------  ---------------------------     ------------------------------
            ID         Identifier (FT description)     Once; starts a PTM entry
            AC         Accession (PTM-xxxx)            Once
            FT         Feature key                     Once
            TG         Target                          Once; two targets separated by
                                                        a dash in case of intrachain
                                                        crosslinks
            PA         Position of the modification    Optional; once
                        on the amino acid
            PP         Position of the modification    Optional; once
                        in the polypeptide
            CF         Correction formula              Optional; once
            MM         Monoisotopic mass difference    Optional; once
            MA         Average mass difference         Optional; once
            LC         Cellular location               Optional; once; alternatives
                                                        can be proposed
            TR         Taxonomic range                 Optional; once or more
            KW         Keyword                         Optional; once or more
            DR         Cross-reference to external     Optional; once or more
                        databases
            //         Terminator                      Once; ends an entry


In [40]:
code2content = {
    "ID": "Identifier",
    "AC": "Accession",
    "FT": "Feature_key",
    "TG": "Target",
    "PA": "Position_amino_acid",
    "PP": "Position_polypeptide",
    "CF": "Correction_formula",
    "MM": "Monoisotopic_mass_difference",
    "MA": "Average_mass_difference",
    "LC": "Cellular_location",
    "TR": "Taxonomic_range",
    "KW": "Keyword",
    "DR": "Reference"
    }

#### Download data files from Uniprot

In [41]:
#!wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/ptmlist.txt

In [42]:
# ID   (2-aminosuccinimidyl)acetic acid (Asn-Gly)
# AC   PTM-0450
# FT   CROSSLNK
# TG   Asparagine-Glycine.
# PA   Amino acid side chain-Amino acid backbone.
# PP   Anywhere-Protein core.
# CF   H-3 N-1
# MM   -17.026549
# MA   -17.03
# LC   Extracellular and lumenal localisation.
# TR   Archaea; taxId:2157 (Archaea).
# TR   Bacteria; taxId:2 (Bacteria).
# TR   Eukaryota; taxId:2759 (Eukaryota).
# DR   RESID; AA0441.
# DR   PSI-MOD; MOD:01624.
def parse_text_to_dict(filepath):
    data = {}
    with open(filepath, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if line.startswith('//'):
                entry = {}
            elif line.startswith('ID'):
                entry[code2content['ID'] ] = line.split()[1].lower()
            elif line.startswith('AC'):
                accessions = line.split()[1]
                entry[code2content['AC'] ] = line.split()[1]
            elif line.startswith('FT'):
                entry[code2content['FT'] ] = line.split()[1]
            elif line.startswith('TG'):
                entry[code2content['TG'] ] = line.split()[1]
            elif line.startswith('PA'):
                entry[code2content['PA'] ] = line.split()[1]
            elif line.startswith('PP'):
                entry[code2content['PP'] ] = line.split()[1]
            elif line.startswith('CF'):
                entry[code2content['CF'] ] = line.split()[1]
            elif line.startswith('MM'):
                entry[code2content['MM'] ] = line.split()[1]
            elif line.startswith('MA'):
                entry[code2content['MA'] ] = line.split()[1]
            elif line.startswith('LC'):
                entry[code2content['LC'] ] = line.split()[1]
            elif line.startswith('TR'):
                entry[code2content['TR'] ] = line.split()[1]
            elif line.startswith('KW'):
                entry[code2content['KW'] ] = line.split()[1]
            elif line.startswith('DR'):
                entry[code2content['DR'] ] = line.split()[1]
                data.update({accessions:entry})
    return data


In [43]:
data = parse_text_to_dict('ptm/ptmlist.txt')

In [44]:
DATA = []
idf2accession = {}
for key, value in data.items():
    idf2accession[value['Identifier']] = value['Accession'] 
    DATA.append(value)

In [45]:
with open('ptm/idf2accession.json', 'w') as f:
    json.dump(idf2accession, f)

In [46]:
# To write the data to a JSON file
with open('ptm/ptm-vocab.json', 'w') as json_file:
    json.dump(DATA, json_file)

In [47]:
df_ptm_vocab = pd.DataFrame(DATA)
df_ptm_vocab.head(2)

Unnamed: 0,Identifier,Accession,Feature_key,Target,Position_amino_acid,Position_polypeptide,Correction_formula,Monoisotopic_mass_difference,Average_mass_difference,Cellular_location,Taxonomic_range,Reference,Keyword
0,(2-aminosuccinimidyl)acetic,PTM-0450,CROSSLNK,Asparagine-Glycine.,Amino,Anywhere-Protein,H-3,-17.026549,-17.03,Extracellular,Eukaryota;,PSI-MOD;,
1,(2-aminosuccinimidyl)acetic,PTM-0312,CROSSLNK,Aspartate-Glycine.,Amino,Anywhere-Protein,H-2,-18.010565,-18.02,Extracellular,Eukaryota;,PSI-MOD;,


In [48]:
df_ptm_vocab.to_csv('ptm/ptm-vocab.csv', index=False)
df_ptm_vocab.to_csv('kgdata/ptm-vocab.csv', index=False)

In [49]:
df_ptm_vocab.columns

Index(['Identifier', 'Accession', 'Feature_key', 'Target',
       'Position_amino_acid', 'Position_polypeptide', 'Correction_formula',
       'Monoisotopic_mass_difference', 'Average_mass_difference',
       'Cellular_location', 'Taxonomic_range', 'Reference', 'Keyword'],
      dtype='object')