### Testing split XML data and the parsing library

In [1]:
import gzip
import pandas as pd
import xml.etree.ElementTree as ET
import os
from parser.single_xml_parser import UniprotEntryParser
from parser.splited_xml_parser import SplitedEntryParser

#### Testing a single split file

In [4]:
# Parse one per-entry XML file and keep both the tree and its root element
# available to the cells that follow.
tree = ET.parse('xmldataset/entry_5.xml')
root = tree.getroot()

# Show the root's qualified tag and its attributes, one per line.
print(root.tag, root.attrib, sep='\n')

{http://uniprot.org/uniprot}entry
{'dataset': 'Swiss-Prot', 'created': '2009-05-05', 'modified': '2022-02-23', 'version': '52'}


In [5]:
# Namespace map: the 'ns0' prefix used in the search path below stands for
# the UniProt XML namespace URI.
ns = {'ns0': 'http://uniprot.org/uniprot'}

# Print the text of every "accession" element found directly under the root
# "entry" element.
# NOTE: the loop variable is called `entry`, but each item it holds is an
# accession element, not a whole entry.
for entry in root.findall('ns0:accession', ns):
    ID = entry.text
    print(ID)

Q65209


In [6]:
def get_accession(root):
    """Return the text of each ``accession`` element directly under ``root``.

    Parameters
    ----------
    root
        An element exposing ``findall`` (in this notebook, the root element
        of a parsed UniProt entry file).

    Returns
    -------
    list
        The ``.text`` of every matching child, in the order ``findall``
        yields them. The list is empty when there is no match.
    """
    # The 'ns0' prefix in the search path resolves to the UniProt namespace.
    namespaces = {'ns0': 'http://uniprot.org/uniprot'}

    collected = []
    for node in root.findall('ns0:accession', namespaces):
        collected.append(node.text)
    return collected

In [7]:
# Run the helper on the parsed entry root; the returned list is shown as the cell output.
get_accession(root)

['Q65209']

In [8]:
def get_entry_dict(entry):
    """Collect the fields of one entry into a plain dictionary.

    ``entry`` is handed to ``SplitedEntryParser``; each value in the result
    is whatever the matching ``get_*`` method of that parser returns.

    Parameters
    ----------
    entry
        The object accepted by ``SplitedEntryParser`` (this notebook passes
        the root element of a parsed entry file).

    Returns
    -------
    dict
        Keys, in insertion order: ``accession``, ``name``, ``gene``,
        ``organism``, ``sequence``, ``uniprotId``, ``ptm``, ``references``.
    """
    parser = SplitedEntryParser(entry)

    # Fill the record one field at a time, keeping the original key order
    # and the original order of the getter calls.
    record = {}
    record['accession'] = parser.get_accession()
    record['name'] = parser.get_name()
    record['gene'] = parser.get_gene()
    record['organism'] = parser.get_organism()
    record['sequence'] = parser.get_sequence()
    record['uniprotId'] = parser.get_uniprotId()
    record['ptm'] = parser.get_ptm()
    record['references'] = parser.get_references()
    return record

In [9]:
# Build the full field dictionary for the parsed entry; the result is shown as the cell output.
get_entry_dict(root)

{'accession': ['Q65209'],
 'name': 'Protein MGF 100-2L',
 'gene': None,
 'organism': 'African swine fever virus (strain Badajoz 1971 Vero-adapted)',
 'sequence': 'MGNKESKYLEMCSEEAWLNIPNIFKCIFIRKLFYNKWLKYQEKKLKKSLKLLSFYHPKKDFVGIRDMLHMAPGGSYFITDNITEEFLMLVVKHPEDGSAEFTKLCLKGSCIVIDGYYYDTLHIFLSETPDIYKYPLIRYDR',
 'uniprotId': 'Q65209',
 'ptm': [],
 'references': [{'key': '1',
   'citation_type': 'journal article',
   'journal': 'J. Virol.',
   'date': '1990',
   'title': None,
   'authors': ['Gonzalez A.',
    'Calvo V.',
    'Almazan F.',
    'Almendral J.M.',
    'Ramirez J.C.',
    'de la Vega I.',
    'Blasco R.',
    'Vinuela E.'],
   'pubmedId': '2325203',
   'doi': '10.1128/jvi.64.5.2073-2081.1990'},
  {'key': '2',
   'citation_type': 'journal article',
   'journal': 'Virology',
   'date': '1995',
   'title': None,
   'authors': ['Yanez R.J.',
    'Rodriguez J.M.',
    'Nogal M.L.',
    'Yuste L.',
    'Enriquez C.',
    'Rodriguez J.F.',
    'Vinuela E.'],
   'pubmedId': '11831707',
 