# Importing data in XML format

In [None]:
import gzip
import pandas as pd
import xml.etree.ElementTree as ET

from parser.single_xml_parser import UniprotEntryParser

# Parse the complete gzip-compressed UniProt XML dump. The context manager
# closes the archive as soon as the tree has been built, and the file handle
# no longer shadows the built-in ``input``.
# (To debug against a single entry, parse a plain file such as 'P05141.xml'.)
with gzip.open('whole uniprot.xml.gz', 'rb') as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

# Sanity check: print the tag of the document root.
print(root.tag)
#print (root.attrib)

In [None]:
#for i, entry in enumerate(root.findall('{http://uniprot.org/uniprot}entry')):
#    UEP = UniprotEntryParser(entry)
#    #print (i, UEP.get_references())

In [None]:
def get_entry_dict(entry):
    """Flatten one UniProt ``entry`` element into a plain dictionary.

    The element is wrapped in a ``UniprotEntryParser`` and each getter is
    called once, in the order in which the keys are listed below.

    Args:
        entry: The entry element handed to ``UniprotEntryParser``.

    Returns:
        dict: The parser's values under the keys ``accession``, ``name``,
        ``gene``, ``organism``, ``sequence``, ``uniprotId``, ``ptm`` and
        ``references``.
    """
    parser = UniprotEntryParser(entry)
    return dict(
        accession=parser.get_accession(),
        name=parser.get_name(),
        gene=parser.get_gene(),
        organism=parser.get_organism(),
        sequence=parser.get_sequence(),
        uniprotId=parser.get_uniprotId(),
        ptm=parser.get_ptm(),
        references=parser.get_references(),
    )

In [None]:
# ``findall`` already returns a list of the matching child elements, so no
# copying comprehension is needed around it.
entries = root.findall('{http://uniprot.org/uniprot}entry')
# One flat dictionary per entry, produced by get_entry_dict.
entry_dicts = [get_entry_dict(entry) for entry in entries]

In [None]:
#entry_dicts[0]

In [None]:
#for item in entry_dicts:
#    print(item['references'])

### Create Mapper

In [None]:
# Index every journal-article and submission reference by its ``key`` so that
# evidence strings can later be resolved to the full reference record.
# NOTE(review): this index is shared by all entries and a later reference with
# the same key replaces an earlier one — confirm that keys are unique across
# entries, otherwise one index per entry is needed.
_INDEXED_CITATION_TYPES = ('journal article', 'submission')

key2pmid = {}
for entry in entry_dicts:
    for reference in entry['references']:
        if reference['citation_type'] in _INDEXED_CITATION_TYPES:
            key2pmid[reference['key']] = reference

In [None]:
# Display the reference index (notebook cell output).
key2pmid

In [None]:
def splitAndMap(longstring):
    """Resolve a whitespace-separated string of reference keys.

    Args:
        longstring: Keys separated by whitespace, e.g. ``"1 4 7"``.

    Returns:
        list: The ``key2pmid`` value of each key, in the order the keys
        appear. An empty or whitespace-only string gives an empty list.

    Raises:
        KeyError: If a key is not present in ``key2pmid``.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks. split(' ') produced empty-string "keys" in
    # those cases, which made the lookup fail for the entire string.
    return [key2pmid[key] for key in longstring.split()]
    

In [None]:
# Build one flat row per PTM: the entry-level columns are repeated on every
# row and the PTM's position, description and resolved evidence are added.
# Entries whose 'ptm' list is empty contribute no rows.
_ENTRY_COLUMNS = (
    # (output column, key in the entry dictionary)
    ("accession", 'accession'),
    ("entry", 'name'),
    ("gene", 'gene'),
    ("organism", 'organism'),
    ("sequence", 'sequence'),
    ("uniprotId", 'uniprotId'),
)

DATA = []
for entry in entry_dicts:
    idata = {column: entry[key] for column, key in _ENTRY_COLUMNS}
    for item in entry['ptm']:
        data = idata.copy()
        data["Position"] = item['Position']
        data["Description"] = item['Description']
        try:
            data["Evidence"] = splitAndMap(item['Evidence'])
        except (KeyError, AttributeError, TypeError):
            # Best effort: a missing or non-string evidence value, or a key
            # without an indexed reference, leaves the column empty. Anything
            # else (KeyboardInterrupt, NameError, ...) is no longer swallowed.
            data["Evidence"] = None
        DATA.append(data)

In [None]:
# Peek at the first flattened row (notebook cell output).
DATA[0]

In [None]:
# One DataFrame row per record collected in DATA.
df = pd.DataFrame(DATA)
# Show the first rows as the cell output.
df.head()

In [None]:
#df.columns
# Write every column to 'alldata.csv' without the positional index.
# NOTE(review): 'Evidence' holds Python lists (or None), so the CSV will
# contain their text representation — confirm that suits downstream readers.
df.to_csv('alldata.csv', index=False)

In [None]:
# Column subset for the reduced export, in the order the columns are listed.
sdf = df[['uniprotId', 'entry', 'gene', 'organism', 'Position', 'Description','Evidence', 'sequence']]

In [None]:
# Write the column subset to 'Selected_data.csv' without the positional index.
sdf.to_csv('Selected_data.csv', index=False)

### ROUGH Code

In [None]:
# Derive, for each row, the list of PTM positions and the list of PTM
# descriptions from the row's 'ptm' value (None stays None).
# NOTE(review): the rows assembled into DATA carry no 'ptm' key, so this only
# works when ``df`` comes from a source that has a 'ptm' column — confirm.
for column, field in (('ptm-position', 'Position'), ('ptm-description', 'Description')):
    df[column] = df['ptm'].apply(
        lambda ptms, field=field: [ptm[field] for ptm in ptms] if ptms is not None else None
    )

In [None]:
from bs4 import BeautifulSoup

# Read the XML file as bytes so that the parser can honour the encoding named
# in the XML declaration instead of the platform's default text encoding.
with open('P12235.xml', 'rb') as f:
    xml_bytes = f.read()

# Parse the XML file
soup = BeautifulSoup(xml_bytes, 'xml')

# Find all modified residues
modified_residues = soup.find_all('feature', {'type': 'modified residue'})

# Index the <evidence> elements of the document by their 'key' attribute
evidences = {evidence.get('key'): evidence for evidence in soup.find_all('evidence')}

# Initialize an empty list to store the data
data = []

# Iterate over all modified residues
for residue in modified_residues:
    # NOTE(review): per the UniProt XSD a single-site feature stores its
    # coordinate in <location><position position="N"/>, while ranges use
    # <begin>/<end>. Try <position> first, then <begin>; None if neither exists.
    location = residue.find('location')
    site = None
    if location is not None:
        site = location.find('position')
        if site is None:
            site = location.find('begin')
    position = site.get('position') if site is not None else None
    description = residue.get('description')

    # A feature without an 'evidence' attribute simply has no evidence ids.
    evidence_ids = (residue.get('evidence') or '').split()

    # Keep the ids that resolve to an <evidence> element of this document.
    # NOTE(review): these are evidence keys, not citation/reference keys.
    references = [evidence_id for evidence_id in evidence_ids if evidence_id in evidences]

    data.append({
        'Position': position,
        'Description': description,
        'Evidence': references
    })

# Now, 'data' is a list of dictionaries, each containing the position, description, and evidence for a modified residue


In [None]:
import xml.etree.ElementTree as ET
# Parse the gzip-compressed UniProt XML export. The context manager closes the
# archive once the tree is built, and the handle does not shadow ``input``.
with gzip.open('all ptm types_AAC.xml.gz', 'rb') as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

ns = {'uniprot': 'http://uniprot.org/uniprot'}  # Namespace


def _pubmed_id(element):
    """Return the id of the first PubMed dbReference below ``element``, or None."""
    db_ref = element.find('.//uniprot:dbReference[@type="PubMed"]', ns)
    return db_ref.get('id') if db_ref is not None else None


# Iterate through each entry
for entry in root.findall('uniprot:entry', ns):
    # PTM description -> list of the <reference> elements supporting it.
    # A list is used because several residues share a description and one
    # residue can cite several references; a single slot would be overwritten.
    ptm_evidence_map = {}

    # Get all references in the entry, by key and by PubMed id
    references = {ref.get('key'): ref for ref in entry.findall('uniprot:reference', ns)}
    references_by_pubmed = {}
    for ref in references.values():
        pubmed = _pubmed_id(ref)
        if pubmed is not None:
            references_by_pubmed.setdefault(pubmed, ref)

    # Get all evidences in the entry
    evidences = {evidence.get('key'): evidence for evidence in entry.findall('uniprot:evidence', ns)}

    # Get all PTMs in the entry
    ptms = entry.findall('uniprot:feature[@type="modified residue"]', ns)

    # For each PTM, follow every evidence key to its reference
    for ptm in ptms:
        ptm_description = ptm.get('description')
        # The 'evidence' attribute is a space-separated list of evidence keys
        # and may be absent altogether.
        for evidence_key in (ptm.get('evidence') or '').split():
            evidence = evidences.get(evidence_key)
            # Compare with None explicitly: an Element without children is
            # falsy, so ``if evidence:`` would skip valid elements.
            if evidence is None:
                continue
            # NOTE(review): per the UniProt XSD the citation is reached through
            # a <source> child — either its 'ref' attribute (a reference key)
            # or a nested PubMed dbReference — not a 'source' attribute.
            # Confirm against the export in use.
            source = evidence.find('uniprot:source', ns)
            if source is None:
                continue
            reference_key = source.get('ref')
            if reference_key is not None:
                reference = references.get(reference_key)
            else:
                reference = references_by_pubmed.get(_pubmed_id(source))
            if reference is None:
                continue
            linked = ptm_evidence_map.setdefault(ptm_description, [])
            if reference not in linked:
                linked.append(reference)

    # Print the mapping
    for ptm, linked in ptm_evidence_map.items():
        print(f'PTM: {ptm}')
        for reference in linked:
            print(f'Reference: {ET.tostring(reference, encoding="unicode")}')


In [None]:
def get_references(self):
    """Index the PubMed id of each reference of ``self.entry`` by reference key.

    Reads ``self.entry`` (the entry element) and ``self.ns`` (the namespace
    URI, accepted with or without surrounding braces). The result is stored
    in ``self.references`` as ``{reference key: PubMed id}``; references
    without a PubMed dbReference are left out. Nothing is returned.
    """
    # The lookups used to mix ``self.ns + 'reference'`` (needs braces inside
    # self.ns) with ``'{%s}...' % self.ns`` (needs a bare URI), so one of them
    # could never match. Normalise once and accept either spelling.
    uri = self.ns.strip('{}')
    self.references = {}
    for ref in self.entry.findall('{%s}reference' % uri):
        pubmed = ref.find(".//{%s}dbReference[@type='PubMed']" % uri)
        if pubmed is not None:
            self.references[ref.get('key')] = pubmed.get('id')

def get_evidences(self):
    """Collect the modified-residue features of ``self.entry`` into ``self.ptms``.

    Reads ``self.entry``, ``self.ns`` (namespace URI, with or without braces)
    and ``self.references`` (reference key -> PubMed id). Each item appended
    to ``self.ptms`` is a dict with:

    * ``description`` - the feature's description, or None;
    * ``position`` - the 'position' attribute of the feature's <position>
      (or, failing that, <begin>) element, or None;
    * ``pubmed_ids`` - one PubMed id (or None when unresolved) per key of the
      feature's 'evidence' attribute, in order.

    Nothing is returned.
    """
    uri = self.ns.strip('{}')

    # evidence key -> PubMed id.
    # NOTE(review): per the UniProt XSD a feature's 'evidence' attribute lists
    # <evidence> keys, and an <evidence> names its citation through a <source>
    # child (a 'ref' attribute holding a reference key, or a nested PubMed
    # dbReference). Evidence keys are therefore not reference keys — confirm
    # against the data in use.
    evidence_pubmed = {}
    for evidence in self.entry.findall('{%s}evidence' % uri):
        source = evidence.find('{%s}source' % uri)
        if source is None:
            continue
        ref_key = source.get('ref')
        if ref_key is not None:
            pubmed_id = self.references.get(ref_key)
        else:
            db_ref = source.find(".//{%s}dbReference[@type='PubMed']" % uri)
            pubmed_id = db_ref.get('id') if db_ref is not None else None
        evidence_pubmed[evidence.get('key')] = pubmed_id

    self.ptms = []
    for ptm in self.entry.findall(".//{%s}feature[@type='modified residue']" % uri):
        # The description is normally an attribute of <feature>; a child
        # element of that name is still honoured if present.
        description = ptm.get('description')
        if description is None:
            child = ptm.find('{%s}description' % uri)
            description = child.text if child is not None else None

        # Range features have <begin>/<end> instead of <position>.
        site = ptm.find('.//{%s}position' % uri)
        if site is None:
            site = ptm.find('.//{%s}begin' % uri)
        position = site.get('position') if site is not None else None

        # A feature without an 'evidence' attribute has no evidence keys.
        evidence_keys = (ptm.get('evidence') or '').split()
        pubmed_ids = [evidence_pubmed.get(key) for key in evidence_keys]
        self.ptms.append({'description': description, 'position': position, 'pubmed_ids': pubmed_ids})

def parse_entry(self, entry):
    """Parse one entry and return its PTM records.

    Stores ``entry`` on ``self.entry``, runs ``self.get_references()`` and
    then ``self.get_evidences()``, and returns ``self.ptms``.
    """
    self.entry = entry
    # References first: get_evidences reads self.references.
    self.get_references()
    self.get_evidences()
    return self.ptms


In [None]:

def _attr(element, name):
    """Return attribute ``name`` of ``element``, or None when the element is missing."""
    return element.get(name) if element is not None else None


def _parse_reference(ref, ns):
    """Flatten one <reference> element into a dictionary.

    Every result has ``key`` and ``citation_type``. A 'journal article' adds
    ``journal``, ``date``, ``title``, ``authors``, ``pubmedId`` and ``doi``;
    a 'submission' adds ``db``, ``date``, ``scope`` and ``source``. Values
    that are absent from the XML are None (or an empty list for the list
    fields). A reference without a <citation> child gets ``citation_type``
    None instead of raising AttributeError.
    """
    citation = ref.find('uniprot:citation', ns)
    citation_type = _attr(citation, 'type')
    reference = {'key': ref.get('key'), 'citation_type': citation_type}

    if citation_type == 'journal article':
        title = ref.find('uniprot:citation/uniprot:title', ns)
        reference['journal'] = citation.get('name')
        reference['date'] = citation.get('date')
        reference['title'] = title.text if title is not None else None
        reference['authors'] = [
            author.get('name')
            for author in ref.findall('uniprot:citation/uniprot:authorList/uniprot:person', ns)
        ]
        reference['pubmedId'] = _attr(
            ref.find('uniprot:citation/uniprot:dbReference[@type="PubMed"]', ns), 'id')
        reference['doi'] = _attr(
            ref.find('uniprot:citation/uniprot:dbReference[@type="DOI"]', ns), 'id')
    elif citation_type == 'submission':
        reference['db'] = citation.get('db')
        reference['date'] = citation.get('date')
        reference['scope'] = [scope.text for scope in ref.findall('uniprot:scope', ns)]
        reference['source'] = [source.text for source in ref.findall('uniprot:source', ns)]

    return reference


# Collect and print the flattened references of every entry under ``root``.
references = []
ns = {'uniprot': 'http://uniprot.org/uniprot'}
for i, entry in enumerate(root.findall('{http://uniprot.org/uniprot}entry')):
    print(i,"-----------------------------------")
    for ref in entry.findall('uniprot:reference', ns):
        reference = _parse_reference(ref, ns)
        references.append(reference)
        print(reference)
        print("---------------------------------------------------")
       
                     

#print(references )           