# Phosphosite Data Processing, 1 of 2.
## Phospho.ELM Data.

A phosphosite data set has been downloaded as a tab-delimited file from http://phospho.elm.eu.org/dataset.html:

- phosphoELM_vertebrate_2015-04.dump
    - The full list of vertebrate phosphosites available on the website (46,248 rows long on 2020_01_17)

In this Jupyter notebook, it will be imported as a pandas dataframe, and will have the following changes made to it:

- Remove any rows with information about non-human proteins.
- Translate kinases using "human_kinase_dataframe.csv".
- Generate a phosphosite ID column.
- Remove unnecessary columns.

It will then be exported as "phosphosites_2.csv" for subsequent incorporation into data tables with data from phosphosite.org, in Jupyter notebook "Phosphosite-data-processing-2-of-2-Phosphosite.org-downloaded-files.ipynb".

Import required packages

In [None]:
import pandas as pd

Read in phosphosite data and table of human kinases, for translating kinase aliases

In [None]:
# Load the tab-delimited Phospho.ELM dump
phosphosite_2_df = pd.read_csv( "phosphoELM_vertebrate_2015-04.dump",
                                sep = "\t" )

# Load the table of human kinases used for translating kinase aliases
kinases_df = pd.read_csv( "human_kinase_dataframe.csv" )

Remove any rows where species is not "Homo sapiens"

In [None]:
# Keep only the rows whose species is "Homo sapiens"
# and renumber the index from zero

is_human = phosphosite_2_df.species == "Homo sapiens"

phosphosite_2_df = phosphosite_2_df[ is_human ].reset_index( drop = True )

Translate the kinase IDs to UniProt IDs, where possible

In [None]:
# Collect the distinct values of the "kinases" column of phosphosite_2_df
# and store each one as an uppercase string
# Duplicates are dropped before the conversion, so names that differ only
# in case can still appear more than once in the list

kinases = [ str( name ).upper()
            for name in phosphosite_2_df.kinases.drop_duplicates() ]

In [None]:
# Create the (initially empty) lookup table of kinase aliases
# Keys: kinase names as they appear in phosphosite_2_df
# Values: the matching UniProt accession IDs taken from kinases_df

kinase_dict = dict()

In [None]:
# Check five columns in kinases_df for the kinase IDs from phosphosite_2_df
# If a kinase is found, store the corresponding UniProt ID in "kinase_dict"
# and remove it from the list "kinases", so that only the untranslated
# names are left for the following cells

# The columns are checked in priority order, and every check walks over a
# snapshot of the names that are still unresolved. Removing items from a
# list while looping over that same list makes the loop skip the element
# that follows each removal, which is why these checks previously had to be
# re-run three times. A single pass is now sufficient


def _equals_entry_name( name, value ):
    """Return True if name plus "_HUMAN" is exactly the column value."""
    return name + "_HUMAN" == value


def _contained_in( name, value ):
    """Return True if name is a substring of the uppercased column value."""
    return name in value.upper()


def _equals_ignoring_case( name, value ):
    """Return True if name is exactly the uppercased column value."""
    return name == value.upper()


# ( column of the kinase table, match test, names that skip this column )
KINASE_MATCH_STRATEGIES = (
    # Kinase plus "_HUMAN" matches the "Entry name"
    ( 'Entry_name', _equals_entry_name, () ),
    # Kinase can be found in "Primary Protein Name"
    # "ABL" is skipped because it matches with the word "probable"
    ( 'Primary_Protein_Name', _contained_in, ( "ABL", ) ),
    # Kinase can be found in "Alternative Protein Name(s)"
    # "LOK" is skipped because it matches with the word "Telokin"
    ( 'Alternative_Protein_Name(s)', _contained_in, ( "LOK", ) ),
    # Kinase exactly matches the "Gene Symbol"
    ( 'Gene_Symbol', _equals_ignoring_case, () ),
    # Kinase can be found in "Alternative Gene Name(s)"
    ( 'Alternative_Gene_Name(s)', _contained_in, () ),
)


def translate_kinases( names, kinases_table,
                       strategies = KINASE_MATCH_STRATEGIES ):
    """Translate kinase names to UniProt IDs.

    Each strategy is applied, in order, to the names that earlier strategies
    could not resolve. Within one strategy the first matching row of
    kinases_table wins.

    Parameters:
        names: iterable of uppercase kinase name strings.
        kinases_table: dataframe holding a "UniProt_ID" column and every
            column named in strategies.
        strategies: sequence of ( column, match test, skipped names ).

    Returns:
        A tuple ( translated, unresolved ), where translated maps each
        matched name to its UniProt ID and unresolved lists the names that
        no strategy matched, in their original order.
    """
    translated = {}
    unresolved = list( names )

    for column, is_match, skipped_names in strategies:

        # Convert the column to strings once, paired row-by-row with the IDs
        candidates = [ ( str( value ), uniprot_id )
                       for value, uniprot_id
                       in zip( kinases_table[ column ],
                               kinases_table[ 'UniProt_ID' ] ) ]

        still_unresolved = []

        for name in unresolved:
            found = False
            if name not in skipped_names:
                for value, uniprot_id in candidates:
                    if is_match( name, value ):
                        translated[ name ] = uniprot_id
                        found = True
                        break
            if not found:
                still_unresolved.append( name )

        unresolved = still_unresolved

    return translated, unresolved


newly_translated, kinases = translate_kinases( kinases, kinases_df )

kinase_dict.update( newly_translated )

Several kinases are still not in the dictionary. Translate as many untranslated kinases as possible

In [None]:
# Create the (initially empty) list of split kinase names
# Each untranslated kinase will be split in two, and the resulting
# list of parts will be stored here together with the original kinase
# name, which is used as a dictionary key later

kinases_2 = list()

In [None]:
# Check for four non-alphanumeric characters in the untranslated kinases
# If one is found, split the name by that character, store the first two
# parts in "kinases_2" and remove the name from "kinases"
# The original kinase name is kept as the third item, for use as a
# dictionary key later

# The characters are checked in the order space, "/", "_", "-", so a name
# containing several of them is split on the first one in that order

# Each check builds a new list of the names that were not split instead of
# removing items from "kinases" while looping over it. Removing during the
# loop skips the element after each removal, which is why these checks
# previously had to be re-run four times

# The third item is always the complete original name, also when the name
# contains the separator more than once. Previously the third item was then
# the third fragment of the name rather than the name itself

KINASE_NAME_SEPARATORS = ( " ", "/", "_", "-" )

for separator in KINASE_NAME_SEPARATORS:

    not_split = []

    for name in kinases:
        if separator in name:
            first_part, second_part = name.split( separator )[ : 2 ]
            kinases_2.append( [ first_part, second_part, name ] )
        else:
            not_split.append( name )

    kinases = not_split

Check the kinases table for the strings in kinases_2

In [None]:
# Check whether the two strings from the kinase name
# can both be found in "Primary Protein Name"
# The first matching row supplies the UniProt ID, stored in "kinase_dict"
# under the original kinase name (third item of each list)

# The entries that were not matched are collected in a new list instead of
# being removed from "kinases_2" while looping over it. Removing during the
# loop skips the element after each removal, which is why this loop
# previously had to be executed twice

# Values are passed through str() before uppercasing, as in the other
# cells, so a missing protein name does not raise an AttributeError

primary_names = [ ( str( value ).upper(), uniprot_id )
                  for value, uniprot_id
                  in zip( kinases_df[ 'Primary_Protein_Name' ],
                          kinases_df[ 'UniProt_ID' ] ) ]

unmatched = []

for parts in kinases_2:
    for protein_name, uniprot_id in primary_names:
        if parts[ 0 ] in protein_name and parts[ 1 ] in protein_name:
            kinase_dict[ str( parts[ 2 ] ) ] = uniprot_id
            break
    else:
        # No row contained both strings
        unmatched.append( parts )

kinases_2 = unmatched

In [None]:
# Check whether the two strings from the kinase name
# can both be found in "Alternative Protein Name(s)"

# Ensure only unambiguous translations are added: the UniProt ID is stored
# only when exactly one row matches

# The entries that were not translated are collected in a new list instead
# of being removed from "kinases_2" while looping over it. Removing during
# the loop skips the element after each removal, which is why this loop
# previously had to be executed twice

alternative_names = [ ( str( value ).upper(), uniprot_id )
                      for value, uniprot_id
                      in zip( kinases_df[ 'Alternative_Protein_Name(s)' ],
                              kinases_df[ 'UniProt_ID' ] ) ]

untranslated = []

for parts in kinases_2:

    hits = [ uniprot_id
             for protein_names, uniprot_id in alternative_names
             if parts[ 0 ] in protein_names and parts[ 1 ] in protein_names ]

    if len( hits ) == 1:
        kinase_dict[ str( parts[ 2 ] ) ] = hits[ 0 ]
    else:
        # Either no match at all or an ambiguous one
        untranslated.append( parts )

kinases_2 = untranslated

In [None]:
# Check whether the first part of the kinase name exactly matches
# "Entry name" minus "_HUMAN"
# The last six characters of every entry name are cropped, which assumes
# that all entry names end in "_HUMAN"

# The entries that were not matched are collected in a new list instead of
# being removed from "kinases_2" while looping over it. Removing during the
# loop skips the element after each removal, so some entries were never
# checked

entry_names = [ ( str( value )[ : -6 ].upper(), uniprot_id )
                for value, uniprot_id
                in zip( kinases_df[ 'Entry_name' ],
                        kinases_df[ 'UniProt_ID' ] ) ]

unmatched = []

for parts in kinases_2:
    for entry_name, uniprot_id in entry_names:
        if parts[ 0 ] == entry_name:
            kinase_dict[ str( parts[ 2 ] ) ] = uniprot_id
            break
    else:
        # No entry name matched the first part
        unmatched.append( parts )

kinases_2 = unmatched

In the phosphosite table, convert the values in the "kinases" column to uppercase so that they match the keys of the translation dictionary

In [None]:
# Build the uppercase string form of every value in the "kinases" column
# and use it to replace the column

uppercase_kinase = pd.Series( [ str( name ).upper()
                                for name in phosphosite_2_df.kinases ] )

phosphosite_2_df = phosphosite_2_df.assign( kinases = uppercase_kinase )

Remove "NAN" from dictionary

In [None]:
# Discard the "NAN" entry from the dictionary ("NAN" is presumably the
# uppercase string form of a missing kinase value)
# A default is supplied so that the cell does not raise a KeyError when
# "NAN" is not in the dictionary, for example when the cell is run twice
kinase_dict.pop( "NAN", None )

Add kinase UniProt ID column to data frame

In [None]:
# Look up every value of the "kinases" column in the dictionary
# If the kinase can be translated, use its UniProt ID
# Otherwise use an empty string

uniprot_id = pd.Series( [ kinase_dict.get( name, "" )
                          for name in phosphosite_2_df.kinases ] )

phosphosite_2_df = phosphosite_2_df.assign( ACC_ID = uniprot_id )

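Using the UniProt ID in the "acc" column (not the kinase "ACC_ID" column added above) together with the phosphosite amino acid residue ("code") and its position, add a phosphosite ID column to act as a Foreign Key for several different phosphosite-related tables in the database. The ID has the form ACC(CODEPOSITION)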

In [None]:
# Build one ID per row in the form ACC(CODEPOSITION), using the uppercase
# "acc" and "code" values and the "position" value of that row

phos_id = pd.Series( [ accession.upper() + "(" + residue.upper() + str( position ) + ")"
                       for accession, residue, position
                       in zip( phosphosite_2_df.acc,
                               phosphosite_2_df.code,
                               phosphosite_2_df.position ) ] )

phosphosite_2_df = phosphosite_2_df.assign( PHOS_ID = phos_id )

Remove unnecessary columns

In [None]:
# Drop the "source" and "entry_date" columns,
# then renumber the index from zero

phosphosite_2_df = (
    phosphosite_2_df
    .drop( columns = [ 'source', 'entry_date' ] )
    .reset_index( drop = True )
)

Write to CSV

In [None]:
# Export the processed table without the index column
phosphosite_2_df.to_csv( path_or_buf = "phosphosites_2.csv",
                         index = False )