### Splitting a large XML file into single-entry files

We split a large XML file into individual files, each associated with a specific protein entry. Later, we parallelize the parsing step using `parallel.py`. The main parsing library is in the `parser` folder.

In [None]:
import gzip
import pandas as pd
import xml.etree.ElementTree as ET
import os
import json as json

from tqdm import tqdm
from parser.splited_xml_parser import SplitedEntryParser

In [None]:
def get_entry_dict(entry):
    """Collect the fields of a single entry into a plain dictionary.

    ``entry`` is handed to ``SplitedEntryParser``; ``split_xml`` passes the
    ElementTree element of one ``entry`` tag. The returned dictionary has
    the keys ``accession``, ``name``, ``gene``, ``organism``, ``sequence``,
    ``uniprotId``, ``ptm`` and ``references``, each holding whatever the
    matching ``get_*`` method of the parser returns.
    """
    entry_parser = SplitedEntryParser(entry)

    # Getters are called in the same order as the keys appear in the result.
    return {
        "accession": entry_parser.get_accession(),
        "name": entry_parser.get_name(),
        "gene": entry_parser.get_gene(),
        "organism": entry_parser.get_organism(),
        "sequence": entry_parser.get_sequence(),
        "uniprotId": entry_parser.get_uniprotId(),
        "ptm": entry_parser.get_ptm(),
        "references": entry_parser.get_references(),
    }

In [3]:
def split_xml(filename, split=True, parse=False):
    """Stream an XML file and write one output file per UniProt ``entry``.

    The file is read incrementally with ``ET.iterparse`` so the whole
    document never has to be held in memory. Entries are numbered from 1 in
    document order and that number is used in the output file names.

    Args:
        filename: Path of the XML file to read.
        split: If truthy, write each entry to ``xmldataset/entry_<n>.xml``.
        parse: If truthy, convert each entry with ``get_entry_dict`` and
            write it to ``jsondataset/entry_<n>.json``. Leave this off when
            the parsing step is going to be parallelized separately.

    The output directories are created in the current working directory if
    they do not exist yet. Progress is printed every 1000 entries.
    """
    entry_tag = "{http://uniprot.org/uniprot}entry"

    # Create the output directories up front; otherwise the first open()
    # below fails with FileNotFoundError on a fresh checkout.
    if split:
        os.makedirs("xmldataset", exist_ok=True)
    if parse:
        os.makedirs("jsondataset", exist_ok=True)

    context = ET.iterparse(filename, events=("start", "end"))
    # The first event is the "start" of the document root; keep a handle on
    # it so that already-processed children can be dropped later.
    _, root = next(context)

    entry_count = 0
    for event, elem in context:
        # Only act once an entry element has been read completely.
        if event != "end" or elem.tag != entry_tag:
            continue

        entry_count += 1

        if split:
            # Write each entry to a separate XML file
            with open(f"xmldataset/entry_{entry_count}.xml", "wb") as f:
                f.write(ET.tostring(elem, encoding="utf-8"))

        # set parse to False if planning to parallelize
        if parse:
            # Write each entry to a separate JSON file
            data = get_entry_dict(elem)
            with open(f"jsondataset/entry_{entry_count}.json", "w") as f:
                json.dump(data, f)

        # print the progress
        if entry_count % 1000 == 0:
            print(f"{entry_count} entries processed")

        # Drop everything collected under the root so far to free memory.
        root.clear()

In [None]:
# Split the whole UniProt XML file into per-entry XML files only; parse=False
# skips the JSON step here because parsing is meant to be parallelized later.
split_xml("largedata/whole uniprot.xml", split=True, parse=False)