### BIO727P - Bioinformatics Software Development Group Project (2019/20)


#### AIM: To retrieve information about human protein kinases from various databases and compile it into one table

In [1]:
# Version: Python 2.7.16

# import the required packages

import pandas as pd # import pandas
import re # import regular expression
import urllib # import url library

__Step 1:__ Compile a list of human protein kinases using UniProt

In [2]:
# Download the UniProt list of protein kinases and pull out every human entry name.
url="https://www.uniprot.org/docs/pkinfam.txt"
webpage=urllib.urlopen(url)
myfile=webpage.read() # whole page as one string

# the capture group keeps only the part before "_HUMAN", so the suffix is put back below
matches=re.findall(r"([A-Z0-9]+)_HUMAN", myfile)

# full UniProt entry names (e.g. XXXX_HUMAN), in the order they appear on the page
human_kinases=[prefix+"_HUMAN" for prefix in matches]

# start the results table with a single 'Entry_name' column
df=pd.DataFrame({'Entry_name' : human_kinases})

__Step 2:__ Extract the following kinase information from UniProt:

__Step 2a:__ UniProt Accession number

In [4]:
uniprot_id=[] # one accession number per kinase, in the same order as human_kinases

for entry_name in human_kinases:
    # ask UniProt for the "id" column of this entry, as tab-separated text
    id_url="https://www.uniprot.org/uniprot/?query="+entry_name+"&columns=id&format=tab"
    id_lines=urllib.urlopen(id_url).read().split("\n")
    # the first line is treated as the column header; the second line is used as the value
    uniprot_id.append(id_lines[1])

df['UniProt_ID']=uniprot_id # store the accession numbers as a new dataframe column

__Step 2b:__ Protein names

In [5]:
def _split_protein_names(raw_names, ec_patterns):
    """Split a UniProt "protein names" field into (primary name, [alternative names]).

    raw_names   -- the text of the field, e.g. "Name (EC 1.2.3.4) (Alt name)"
    ec_patterns -- compiled regular expressions; every match is deleted first

    After the EC annotations are removed, closing brackets are dropped and the
    text is split on "(". The first piece is the primary name; the remaining
    non-empty pieces are the alternative names. Whitespace inside each piece is
    collapsed to single spaces and trimmed, so a name never carries the stray
    spaces left behind where an EC annotation was removed.
    """
    for ec_pattern in ec_patterns:
        raw_names=ec_pattern.sub("", raw_names)
    pieces=[" ".join(piece.split()) for piece in raw_names.replace(")", "").split("(")]
    return pieces[0], [piece for piece in pieces[1:] if piece]

# EC-number patterns are the same for every kinase, so compile them once
ec_patterns=(
    re.compile(r"\(EC [\d]+\.[\d]+\.[\d]+\.[\d]+\)"), # complete EC numbers, e.g. (EC 2.7.10.2)
    re.compile(r"\(EC [\d]+\.[\d]+\.([\d]+|\-)\.\-\)"), # partial EC numbers, e.g. (EC 2.7.11.-) or (EC 2.7.-.-)
)

primary_protein_name=[] # open two empty lists for the two different types of names we see
alternative_protein_names=[]

for entry_name in human_kinases:
    protein_url="https://www.uniprot.org/uniprot/?query="+entry_name+"&columns=protein%20names&format=tab" # retrieve webpage
    protein_webpage=urllib.urlopen(protein_url) # open the URL
    try:
        protein_file=protein_webpage.read() # read the URL contents
    finally:
        protein_webpage.close() # release the connection even if the read fails
    protein_list=protein_file.split("\n") # separate protein names from header
    primary_name, alternative_names=_split_protein_names(str(protein_list[1]), ec_patterns)
    primary_protein_name.append(primary_name)
    alternative_protein_names.append(alternative_names)

df['Primary_Protein_Name']=primary_protein_name # add the primary names list as a column to the dataframe
df['Alternate_Protein_Name(s)']=alternative_protein_names # add the alternate names list as a column to the dataframe

__Step 2c:__ Gene symbols

In [7]:
def _fetch_gene_column(entry_name, column):
    """Return the second line of a UniProt tab-format query for one column of one entry.

    entry_name -- UniProt entry name used as the query text
    column     -- value placed in the "columns" URL parameter,
                  e.g. "genes(PREFERRED)" or "genes(ALTERNATIVE)"

    The first line of the response is treated as the column header, so the
    second line is returned as the value.
    """
    gene_url="https://www.uniprot.org/uniprot/?query="+entry_name+"&columns="+column+"&format=tab"
    gene_webpage=urllib.urlopen(gene_url) # open the URL
    try:
        gene_file=gene_webpage.read() # read the URL contents
    finally:
        gene_webpage.close() # release the connection even if the read fails
    return gene_file.split("\n")[1]

primary_gene_name=[] # primary gene symbol, one per kinase
alternative_gene_names=[] # list of alternate gene symbols, one list per kinase

for entry_name in human_kinases:
    primary_gene_name.append(_fetch_gene_column(entry_name, "genes(PREFERRED)"))
    # split() with no argument returns [] for an empty field,
    # whereas split(" ") would return [''] (a list holding one empty name)
    alternative_gene_names.append(str(_fetch_gene_column(entry_name, "genes(ALTERNATIVE)")).split())

df['Gene_Symbol']=primary_gene_name # add the primary names list as a column to the dataframe
df['Alternative_Gene_Name(s)']=alternative_gene_names # add the alternate names list as a column to the dataframe

__Step 2d:__ Families

In [8]:
family=[] # protein family text for each kinase, in the same order as human_kinases

for kinase in human_kinases:
    # request only the "families" column for this entry, as tab-separated text
    fam_url="https://www.uniprot.org/uniprot/?query="+kinase+"&columns=families&format=tab"
    fam_response=urllib.urlopen(fam_url)
    fam_rows=fam_response.read().split("\n") # first row is treated as the header
    family.append(fam_rows[1]) # second row is used as the family description

df['Families']=family # store the family names as a new dataframe column

__Step 2e:__ Sequence

In [9]:
# For every kinase, request only the "sequence" column from UniProt and keep the
# second line of the reply (the first line is treated as the column header).
sequence=[
    urllib.urlopen(
        "https://www.uniprot.org/uniprot/?query="+kinase+"&columns=sequence&format=tab"
    ).read().split("\n")[1]
    for kinase in human_kinases
]

df['AA_Seq']=sequence # store the amino acid sequences as a new dataframe column

__Step 2f:__ Molecular Mass

In [10]:
# one query URL per kinase, each asking only for the "mass" column
mass_urls=[
    "https://www.uniprot.org/uniprot/?query="+kinase+"&columns=mass&format=tab"
    for kinase in human_kinases
]

molecular_mass=[] # mass text for each kinase, in the same order as human_kinases

for mass_url in mass_urls:
    mass_rows=urllib.urlopen(mass_url).read().split("\n") # first row is treated as the header
    molecular_mass.append(mass_rows[1]) # second row is used as the mass value

df['Molecular_Mass_(Da)']=molecular_mass # store the masses as a new dataframe column

__Step 2g:__ Subcellular location

In [11]:
# The cleaning rules are the same for every kinase, so they are set up once here.
# They are applied in exactly this order to each location string.
evidence_pair_pattern=re.compile(r"[A-Za-z0-9]+:[A-Za-z0-9]+\|[A-Za-z0-9]+:[A-Za-z0-9]+(\}|\,)") # removes pubmed IDs etc
note_pattern=re.compile(r"Note=([A-Za-z0-9].+)") # removes notes (everything from "Note=" onwards)
other_id_pattern=re.compile(r"{[A-Za-z0-9]+:[A-Za-z0-9]+") # removes other IDs

# (text to find, replacement) pairs: punctuation becomes either nothing or a comma
punctuation_swaps=(
    (" {.", ","),
    (" {;", ","),
    (" }.", ","),
    (" }", ","),
    (" ;", ""),
    (" { .", ","),
    (";", ""),
    ("SUBCELLULAR LOCATION: ", ""),
    (".", ","),
)

subcellular_location=[] # cleaned location text for each kinase, in the same order as human_kinases

for kinase in human_kinases:
    cell_url="https://www.uniprot.org/uniprot/?query="+kinase+"&columns=comment(SUBCELLULAR%20LOCATION)&format=tab"
    cell_rows=urllib.urlopen(cell_url).read().split("\n") # first row is treated as the header
    locations=str(cell_rows[1]) # second row is used as the raw location text

    for cleaner in (evidence_pair_pattern, note_pattern, other_id_pattern):
        locations=cleaner.sub("", locations)
    for old_text, new_text in punctuation_swaps:
        locations=locations.replace(old_text, new_text)

    if not locations:
        locations="NA" # nothing left after cleaning: mark the field as NA
    subcellular_location.append(locations+".") # every field ends with a full stop, for formatting in excel later on

df['Subcellular_Location']=subcellular_location # store the locations as a new dataframe column

__Step 3:__ Export the pandas dataframe to a .csv type file

In [None]:
# write the finished table to a .csv file; index=False leaves the row index out of the file
df.to_csv(path_or_buf="human_kinase_dataframe.csv", index=False)