### Bulk Parsing and Saving

In [11]:
import gzip
import json
import pandas as pd
import xml.etree.ElementTree as ET
import os
from parser.single_xml_parser import UniprotEntryParser
from parser.splited_xml_parser import SplitedEntryParser

### Preparing a sample parsing function

In [13]:
def parse_file(file_path):
    """Parse one entry XML file into a plain dictionary.

    The file is read with ``xml.etree.ElementTree``; its root element is
    handed to ``SplitedEntryParser``, whose getters supply the values.

    Parameters
    ----------
    file_path
        Path (or file object) accepted by ``ET.parse``.

    Returns
    -------
    dict
        Keys, in order: ``accession``, ``name``, ``gene``, ``organism``,
        ``sequence``, ``uniprotId``, ``ptm``, ``references``. Each value is
        whatever the corresponding ``SplitedEntryParser.get_*`` method returns.
    """
    root = ET.parse(file_path).getroot()
    entry = SplitedEntryParser(root)

    # Getters are invoked in the same order as the keys appear in the result.
    return {
        'accession': entry.get_accession(),
        'name': entry.get_name(),
        'gene': entry.get_gene(),
        'organism': entry.get_organism(),
        'sequence': entry.get_sequence(),
        'uniprotId': entry.get_uniprotId(),
        'ptm': entry.get_ptm(),
        'references': entry.get_references(),
    }

In [14]:
# Sanity-check parse_file on a single entry; the returned dict is displayed as the cell output.
parse_file('xmldataset/entry_546602.xml')

{'accession': ['B9IVW8'],
 'name': 'Bifunctional protein PyrR',
 'gene': 'pyrR',
 'organism': 'Bacillus cereus (strain Q1)',
 'sequence': 'MQEKAVVLDDQMIRRALTRISHEIVERNKGVDNCVLVGIKTRGIFIAQRLAERIGQIEGKEMEVGELDITLYRDDLTLQSKNKEPLVKGSDIPVDITKKKVILVDDVLYTGRTVRAAMDALMDLGRPSQIQLAVLVDRGHRELPIRADYVGKNIPTSSEERIEVDLQETDQQDRVSIYDK',
 'uniprotId': 'B9IVW8',
 'ptm': [],
 'references': [{'key': '1',
   'citation_type': 'journal article',
   'journal': 'J. Bacteriol.',
   'date': '2009',
   'title': None,
   'authors': ['Xiong Z.',
    'Jiang Y.',
    'Qi D.',
    'Lu H.',
    'Yang F.',
    'Yang J.',
    'Chen L.',
    'Sun L.',
    'Xu X.',
    'Xue Y.',
    'Zhu Y.',
    'Jin Q.'],
   'pubmedId': '19060151',
   'doi': '10.1128/jb.01629-08'}]}

### Counting files in the folder and getting the file paths

In [9]:
# Directory holding the per-entry XML files (relative to the notebook's working directory).
file_dir = 'xmldataset'
# Peek at one file name; os.listdir returns names in arbitrary order, so "first" is not stable across machines.
os.listdir(file_dir)[0]

'entry_546602.xml'

In [10]:
# Maximum number of XML entries to include in this sample run.
nfiles = 10

# Filter to XML files *before* taking the first `nfiles`, so that stray
# non-XML directory entries (e.g. checkpoints or hidden files) cannot
# silently shrink the sample below `nfiles`.
# Note: os.listdir order is arbitrary, so which files are picked may vary
# between machines.
xml_names = [name for name in os.listdir(file_dir) if name.endswith('.xml')]
file_paths = [os.path.join(file_dir, name) for name in xml_names[:nfiles]]
file_paths[0]

'xmldataset/entry_546602.xml'

---------------------------------------------------------------------------

### Parallelize the parsing of xml files
Run the script from a terminal: `python3 parallel.py`

The parsed data will be written to `all_uniprot_data.json`.

---------------------------------------------------------------------------

### Mapping References to PTM Evidence

In [None]:
# Load the combined parse results into memory.
# NOTE(review): presumably this is the file produced by parallel.py — confirm the output path matches.
with open("output/all_uniprot_data.json", "r") as f:
    uniprot_data = json.load(f) 

In [None]:
# Build a lookup from reference key to the full reference record, across
# every entry in `uniprot_data`, and persist it as JSON.
#
# The dict is created once, *outside* the loop: re-creating it per entry
# would discard all but the last entry's references, and would leave the
# name undefined (NameError at dump time) when `uniprot_data` is empty.
#
# NOTE(review): reference keys look entry-local (the sample entry uses
# key '1'), so references from different entries may share a key and the
# later one overwrites the earlier. If a global lookup is needed, consider
# namespacing by the entry's 'uniprotId' — confirm against the consumer of
# key2reference.json.
key2reference = {}
for item in uniprot_data:
    for reference in item['references']:
        key2reference[reference['key']] = reference

with open('kgdata/key2reference.json', 'w') as f:
    json.dump(key2reference, f)