## Data Exploration

In [23]:
import pandas as pd

In [24]:
# Load the combined table from the working directory; its 'Entry' column is read in a later cell.
df = pd.read_csv("combined.csv")

### Extract Names and Synonyms for UniProt IDs

In [25]:
# Column of entry identifiers; each value is later converted to str and appended to the UniProt URL.
uids = df['Entry']

In [26]:
"""
Queries Uniprot database and retrieves protein data.
"""
import urllib.request
from urllib.error import HTTPError
from lxml import etree
import lxml.etree
import lxml.html
import json
import xmlschema
import lxml
import requests
from bs4 import BeautifulSoup
import html

In [27]:
# NOTE: `format` shadows the built-in of the same name; the name is kept
# because other cells may refer to it.
format = '.xml'
# Base URL; the request URL for one protein is url + <ID> + format.
url = 'https://www.uniprot.org/uniprot/'
# Schema used to decode each entry's XML into nested dicts and lists.
schema = xmlschema.XMLSchema('https://www.uniprot.org/docs/uniprot.xsd')

# Seconds to wait for one response, so a single stalled request cannot hang
# the whole loop.
REQUEST_TIMEOUT = 30

# One {"ID": ..., "names": [...]} record per value in `uids`, in order.
DATA = []
# Diagnostics; every entry is a one-item dict {ID: message}.
LOG = []


def _name_texts(node):
    """Return the text values held by a decoded name node.

    ``node`` may be ``None`` (element absent), a plain value, a dict whose
    text sits under the ``'$'`` key (the key the original lookups such as
    ``rName['$']`` relied on), or a list mixing those forms.  Entries that
    carry no text are skipped.
    """
    if node is None:
        return []
    items = node if isinstance(node, list) else [node]
    texts = []
    for item in items:
        text = item.get('$') if isinstance(item, dict) else item
        if text is not None and text != "":
            texts.append(text)
    return texts


def _fetch_protein(accession):
    """Download one entry and return the decoded 'protein' section.

    Raises whatever the download, the XML parser, the schema decoder or the
    dictionary lookups raise; the caller decides how to report it.
    """
    search_url = url + accession + format
    with urllib.request.urlopen(search_url, timeout=REQUEST_TIMEOUT) as r:
        raw_data = r.read().strip()

    # `tree` is the parsed XML element that the schema decodes below.
    tree = etree.fromstring(raw_data)
    entry_dict = schema.to_dict(tree)
    # Only the first entry of the response is used.
    return entry_dict['entry'][0]['protein']


def _collect_names(accession, protein):
    """Return the names of one protein in a fixed order.

    Order: recommended full name, recommended short names, EC numbers
    (prefixed with "EC "), then for each alternative name its full name
    followed by its short names.  Missing parts are recorded in ``LOG``.
    """
    collected = []

    recommended = protein.get('recommendedName')
    if not isinstance(recommended, dict):
        recommended = {}

    full_names = _name_texts(recommended.get('fullName'))
    if full_names:
        collected.extend(full_names)
    else:
        LOG.append({accession: "no  reco full name"})

    # Every short name is kept, whichever form (plain or dict) each one has.
    short_names = _name_texts(recommended.get('shortName'))
    if short_names:
        collected.extend(short_names)
    else:
        LOG.append({accession: "no Short Name"})

    ec_numbers = _name_texts(recommended.get('ecNumber'))
    if ec_numbers:
        collected.extend("EC " + str(ec) for ec in ec_numbers)
    else:
        LOG.append({accession: "no EC num"})

    # Looked up fresh for every protein: an entry without alternative names
    # must not inherit the list left over from the previous iteration.
    alternatives = protein.get('alternativeName')
    if alternatives is None:
        LOG.append({accession: "no alt names"})
        alternatives = []
    elif not isinstance(alternatives, list):
        alternatives = [alternatives]

    for item in alternatives:
        if not isinstance(item, dict):
            LOG.append({accession: "no alt full name"})
            continue

        alt_full = _name_texts(item.get("fullName"))
        if alt_full:
            collected.extend(alt_full)
        else:
            LOG.append({accession: "no alt full name"})

        alt_short = _name_texts(item.get("shortName"))
        if alt_short:
            collected.extend(alt_short)
        else:
            LOG.append({accession: "no alt short name"})

    return collected


for ID in uids:
    ID = str(ID)
    data = {"ID": ID, "names": []}
    names = []

    try:
        names = _fetch_protein(ID)
    except Exception as err:
        # Best effort: a failed entry still gets a record with no names.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working during the long download loop.
        print(ID, "not found!")
        LOG.append({ID: "fetch failed: " + repr(err)})

    if names and isinstance(names, dict):
        data['names'].extend(_collect_names(ID, names))

    DATA.append(data)

In [28]:
# Persist every collected record as a single JSON array.
with open("proteins.json", mode='w') as json_file:
    json.dump(obj=DATA, fp=json_file)

In [29]:
# Show the first collected record as a quick sanity check.
DATA[0]

{'ID': 'P17302',
 'names': ['Gap junction alpha-1 protein',
  'Connexin-43',
  'Cx43',
  'Gap junction 43 kDa heart protein']}

In [32]:
# Opened in write mode (truncates any existing file); the next cell writes one line per record to this handle.
# NOTE(review): no close() is visible in this part of the notebook — confirm the handle is closed later.
file = open("entities.txt","w")

In [33]:
# Write one line per record: "<ID>|<name_1>|<name_2>|...", where each name is
# lower-cased and has its spaces replaced by underscores.
for item in DATA:
    names = item['names']
    normalized = [str(n).lower().replace(" ", "_") for n in names]
    # Joining with "|" yields just the ID when the record has no names.
    name = "|".join([item["ID"], *normalized])

    file.write(name + "\n")
    print(item)
    print(name)
    print("-------------------------")

# `file` is opened in an earlier cell and is not closed here; flush so the
# buffered lines reach entities.txt even if the handle stays open.
file.flush()

{'ID': 'P17302', 'names': ['Gap junction alpha-1 protein', 'Connexin-43', 'Cx43', 'Gap junction 43 kDa heart protein']}
P17302|gap_junction_alpha-1_protein|connexin-43|cx43|gap_junction_43_kda_heart_protein
-------------------------
{'ID': 'Q13936', 'names': ['Voltage-dependent L-type calcium channel subunit alpha-1C', 'Calcium channel, L type, alpha-1 polypeptide, isoform 1, cardiac muscle', 'Voltage-gated calcium channel subunit alpha Cav1.2']}
Q13936|voltage-dependent_l-type_calcium_channel_subunit_alpha-1c|calcium_channel,_l_type,_alpha-1_polypeptide,_isoform_1,_cardiac_muscle|voltage-gated_calcium_channel_subunit_alpha_cav1.2
-------------------------
{'ID': 'O95180', 'names': ['Voltage-dependent T-type calcium channel subunit alpha-1H', 'Low-voltage-activated calcium channel alpha1 3.2 subunit', 'Voltage-gated calcium channel subunit alpha Cav3.2']}
O95180|voltage-dependent_t-type_calcium_channel_subunit_alpha-1h|low-voltage-activated_calcium_channel_alpha1_3.2_subunit|voltage-ga