## Data Visualisation

In [None]:
#TODO: export the RKA and identified-kinase dataframes as CSVs (name each file after the plot it feeds)
#TODO: wrap the analysis code in functions
#TODO: let the user download the resulting data

In [2]:
import pandas as pd
import re
import numpy as np #use this to log transform data
import requests
import sqlite3

In [3]:
def open_tsv(filename):
    """Read a tab-separated results file and give its columns standard names.

    The file is parsed with the literal string 'inf' treated as missing.
    Depending on how many columns the file has:

    * more than 7 -> only the first 7 columns are kept,
    * exactly 7   -> all columns are kept,
    * exactly 5   -> all columns are kept (no CV columns).

    Parameters
    ----------
    filename : str or path-like
        Path of the TSV file (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame
        Columns 'Substrate', 'Control_mean', 'Inhibitor_mean', 'Fold_change',
        'p_value' and, when 7 or more columns were present, 'ctrlCV' and
        'treatCV'.

    Raises
    ------
    ValueError
        If the file has fewer than 7 columns but not exactly 5, because the
        columns cannot be mapped onto the expected names.
    """
    base_cols = ["Substrate", "Control_mean", "Inhibitor_mean", "Fold_change", "p_value"]
    full_cols = base_cols + ["ctrlCV", "treatCV"]

    df = pd.read_csv(filename, na_values='inf', sep='\t')
    n_cols = len(df.columns)

    if n_cols > 7:
        # Extra trailing columns: re-read keeping only the first seven.
        # index_col=False stops pandas from promoting the first column to the index.
        df = pd.read_csv(filename, usecols=list(range(0, 7)), na_values='inf', sep='\t',
                         header=0, index_col=False)
        df.columns = full_cols
    elif n_cols == 7:
        # The first read is already what we need; no second parse required.
        df.columns = full_cols
    elif n_cols == 5:
        df.columns = base_cols
    else:
        raise ValueError(
            "Expected 5 or at least 7 tab-separated columns in {!r}, found {}".format(filename, n_cols)
        )
    return df

In [None]:
#open_tsv('az20.tsv')

In [4]:
def initial_data_filter(df):
    """Clean a frame produced by ``open_tsv`` and split out the phosphosite.

    Steps, in order:

    1. If the variance columns 'ctrlCV' / 'treatCV' are present, their NaN
       values are replaced with 0 so those rows survive the next step.
    2. Rows with a NaN in any remaining column are dropped.
    3. Rows whose Substrate contains the text "None" are dropped.
    4. Rows whose Substrate contains a bracketed M residue, e.g. "(M12)",
       are dropped.
    5. Every bracketed "<character><digits>" site in Substrate is collected
       into a new comma-joined 'Phosphosite' column inserted at position 1.
    6. Substrate is reduced to the text before the last opening bracket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a string 'Substrate' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame with the extra 'Phosphosite' column.
    """
    # Missing variance values should not cause a row to be discarded.
    cv_cols = [col for col in ("ctrlCV", "treatCV") if col in df.columns]
    if cv_cols:
        df = df.fillna({col: 0 for col in cv_cols})

    df = df.dropna(axis='index', how='any')
    df = df[~df.Substrate.str.contains("None")]

    met_site = r"\([M]\d+\)"  # an M in brackets followed by one or more digits
    # .copy() so the column edits below act on an independent frame, not a slice.
    df = df[~df.Substrate.str.contains(met_site)].copy()

    # findall yields a list of sites per row; join them into "S12,T34".
    phos = df.Substrate.str.findall(r"\((.\d+)").str.join(',')
    df.insert(1, "Phosphosite", phos, True)  # second column; duplicates allowed

    # NOTE(review): the pattern is greedy, so with several bracketed sites only the
    # last bracket is stripped from the name - confirm against multi-site input.
    df["Substrate"] = df.Substrate.str.extract(r"(.+)\(", expand=False)
    return df

In [5]:
#Load 'mux.tsv', apply the initial filter and preview the first rows
df=initial_data_filter(open_tsv('mux.tsv'))
df.head()

Unnamed: 0,Substrate,Phosphosite,Control_mean,Inhibitor_mean,Fold_change,p_value,ctrlCV,treatCV
0,ZZZ3,S82,383747800.0,335099800.0,0.873229,0.49062,0.248829,0.190148
1,ZZEF1,S1464,432438800.0,246629100.0,0.570322,0.182476,0.36126,0.590201
2,ZYX,S344,17347340000.0,13771400000.0,0.793862,0.422038,0.329156,0.317195
3,ZYX,S308,2504079000.0,1527156000.0,0.609867,0.116059,0.247867,0.446213
4,ZYX,S281,4483330000.0,4509845000.0,1.005914,0.968388,0.196722,0.15006


In [6]:
def find_sub_gene(entry, timeout=30):
    """Look up the preferred gene name for a UniProt '<NAME>_HUMAN' entry.

    Parameters
    ----------
    entry : object
        Substrate identifier. Only strings matching ``.+_HUMAN`` trigger a
        web lookup; anything else (a gene name, NaN, None) is returned as is.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises,
        so a stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    object
        For a '<NAME>_HUMAN' entry: the string form of a list holding at most
        one gene name, i.e. "['GENE']" or "[]" when the response has no
        second line. Otherwise ``entry`` unchanged.
    """
    # Non-string values (e.g. NaN from a failed extraction) cannot be matched
    # against the pattern; pass them through instead of raising TypeError.
    if not isinstance(entry, str) or not re.match(r".+_HUMAN", entry):
        return entry           #not a UniProt entry name, so treat it as a gene name

    # NOTE(review): this is the older uniprot.org query URL format - confirm the
    # endpoint still answers with tab-separated text before relying on it.
    URL = 'http://www.uniprot.org/uniprot/?query==mnemonic:'+entry+'&columns=genes(PREFERRED)&format=tab'
    r = requests.get(URL, timeout=timeout)
    content = r.text.splitlines()
    gene_name = content[1:2]   #line 0 is the header; slice so a short response gives []
    return str(gene_name)      #returns the one-element list rendered as a string

In [7]:
def convert_to_gene(df, lookup=None):
    """Replace the entries in ``df['Substrate']`` with gene names.

    Each distinct substrate is resolved once through ``lookup``, the
    surrounding "[]" and quote characters of the returned text are stripped,
    and rows whose result is empty are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Substrate' column. It is modified in place (column
        overwritten, unresolved rows dropped) and also returned.
    lookup : callable, optional
        Function mapping one substrate value to its gene-name text. Defaults
        to ``find_sub_gene``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after conversion.
    """
    if lookup is None:
        lookup = find_sub_gene
    if df.empty:
        return df

    # Resolve each distinct substrate once; repeated rows reuse the answer
    # instead of triggering another lookup.
    resolved = {name: lookup(name) for name in df["Substrate"].unique()}
    genes = df["Substrate"].map(resolved)

    genes = genes.str.strip("[]").str.strip("''") #remove [] and ''
    # Assign the result back to the frame: an inplace replace on df.Substrate
    # is a chained operation that may act on a temporary copy.
    df["Substrate"] = genes.replace("", np.nan)
    df.dropna(subset=["Substrate"], inplace=True)
    return df

In [8]:
#Reload and re-filter 'mux.tsv', then replace '*_HUMAN' substrate entries with gene names
df=initial_data_filter(open_tsv('mux.tsv'))
df=convert_to_gene(df)

In [14]:
def find_kinase(df, db_path="11.db", csv_path='volplot_table.csv'):
    """Annotate each substrate/phosphosite row with its known kinase(s).

    Reads Kin_Gene_Name, Substrate_Gene_Name and Substrate_Modified_Residue
    from the PhosphoSites table and joins them onto ``df`` where
    (Substrate, Phosphosite) equals (Substrate_Gene_Name,
    Substrate_Modified_Residue). The joined table is also written to
    ``csv_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Substrate' and 'Phosphosite' columns. It is not modified.
    db_path : str, optional
        SQLite database file containing the PhosphoSites table.
    csv_path : str, optional
        Where the annotated table is saved as CSV.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus a 'Kinase' column. Rows without a match keep NaN there;
        a row matching several kinases appears once per kinase.
    """
    conn = sqlite3.connect(db_path) #connect to our database
    try:
        phosdf = pd.read_sql_query(
            'SELECT Kin_Gene_Name, Substrate_Gene_Name, Substrate_Modified_Residue FROM PhosphoSites',
            conn,
        )
    finally:
        conn.close() #release the database file even if the query fails

    #join database dataframe with file dataframe where substrate gene name and modified residue are the index
    site_index = phosdf.set_index(['Substrate_Gene_Name', 'Substrate_Modified_Residue'])
    df1 = df.join(site_index, on=['Substrate', 'Phosphosite'])
    df1 = df1.rename(columns={'Kin_Gene_Name': 'Kinase'})

    df1.to_csv(csv_path) #save a copy for download; to_csv returns None when given a path
    return df1 #returns dataframe with Kinases (NaN results included)

In [12]:
def relative_kinase_activity(df1, csv_path='barplot_table.csv'):
    """Summarise control and inhibitor signal per kinase.

    For every kinase, the Control_mean and Inhibitor_mean values of its rows
    are summed and divided by the corresponding column total over *all* rows
    of ``df1`` (rows without a kinase count towards the totals but get no
    entry of their own). The summary is also written to ``csv_path``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with 'Kinase', 'Control_mean' and 'Inhibitor_mean' columns.
    csv_path : str, optional
        Where the summary table is saved as CSV.

    Returns
    -------
    pandas.DataFrame
        One row per kinase with columns 'Kinase', 'Control_Mean',
        'Relative_Kinase_Activity', 'Relative_Inhibited_Kinase_Activity' and
        'Inhibitor_Mean', sorted by 'Relative_Kinase_Activity' descending.
    """
    by_kinase = df1.groupby("Kinase")

    #Relative kinase activity of control
    kinase_sum = by_kinase.Control_mean.sum()   #control sum per kinase
    total_sum = df1.Control_mean.sum()          #control total over the whole file
    Relative_Kinase_Activity = kinase_sum / total_sum

    #Relative kinase activity of inhibitor
    inhib_sum = by_kinase.Inhibitor_mean.sum()  #inhibitor sum per kinase
    inhib_total = df1.Inhibitor_mean.sum()
    inhib_activity = inhib_sum / inhib_total

    kinasedf = pd.DataFrame({"Control_Mean": kinase_sum,
                             "Relative_Kinase_Activity": Relative_Kinase_Activity,
                             "Relative_Inhibited_Kinase_Activity": inhib_activity,
                             "Inhibitor_Mean": inhib_sum})
    kinasedf = kinasedf.reset_index() #turn the Kinase index back into a column
    #most active kinase (by relative control activity) first
    kinasedf = kinasedf.sort_values(by='Relative_Kinase_Activity', ascending=False)

    kinasedf.to_csv(csv_path) #save a copy for download; to_csv returns None when given a path
    return kinasedf #returns sorted dataframe

In [15]:
#Keep the kinase-annotated substrate table under its own name: the volcano-plot
#cells further down read `df1`, which was otherwise never defined on a fresh kernel
df1=find_kinase(df)
#Per-kinase summary used for the bar chart
kdf=relative_kinase_activity(df1)
kdf.head()

Unnamed: 0,Kinase,Control_Mean,Inhibitor_Mean,Relative_Inhibited_Kinase_Activity,Relative_Kinase_Activity
25,CDK1,1730205000000.0,1700268000000.0,0.043062,0.043947
28,CDK2,1584095000000.0,1588941000000.0,0.040243,0.040235
39,CK2A1,1063678000000.0,1163821000000.0,0.029476,0.027017
106,PKACA,478619500000.0,404601300000.0,0.010247,0.012157
6,Akt1,470925400000.0,418708700000.0,0.010605,0.011961


In [40]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import Span, ColumnDataSource, HoverTool, Toggle
from bokeh.layouts import layout
from bokeh.transform import dodge
#use output_notebook so it can be visualised in this notebook

In [16]:
def rka_barchart(kinasedf, top_n=25):
    """Build a horizontal bar chart of relative kinase activity.

    Two bars are drawn per kinase, offset either side of the category
    position: the control activity and the inhibited activity.

    Parameters
    ----------
    kinasedf : pandas.DataFrame
        Frame with 'Kinase', 'Relative_Kinase_Activity' and
        'Relative_Inhibited_Kinase_Activity' columns, already ordered so the
        rows to show come first.
    top_n : int, optional
        Number of leading rows to plot.

    Returns
    -------
    bokeh figure
        The configured figure; the caller is responsible for showing it.
    """
    #Bar graph of Relative Kinase Activity
    top = kinasedf.iloc[:top_n] #first top_n rows by position, whatever the index labels are
    kinase_name = top.Kinase.tolist()
    src = ColumnDataSource(top) #only the plotted rows are sent to the browser
    hover = HoverTool(tooltips=[('Kinase','@Kinase'), ('Relative Kinase Activity', '@Relative_Kinase_Activity'),
                                ('Relative Inhibited Kinase Activity','@Relative_Inhibited_Kinase_Activity')])

    # NOTE(review): `plot_height` and the `legend=` keyword below are kept as written;
    # newer Bokeh releases spell these `height` and `legend_label` - confirm against
    # the installed version.
    plot1 = figure(y_range=kinase_name, plot_height=1800)
    plot1.title.text = "Relative Kinase Activity of the Top {} Identified Kinases".format(top_n)
    plot1.title.text_font_size = "20px"
    plot1.xaxis.axis_label = "Relative Kinase Activity"
    plot1.x_range.start = 0
    plot1.yaxis.axis_label = "Kinase"

    #dodge shifts each bar off the category centre so the pair sits side by side
    plot1.hbar(y=dodge('Kinase',-0.25, range=plot1.y_range), right='Relative_Kinase_Activity', height=0.45, source=src, color='#2F4F4F', legend='Relative Kinase Activity')
    plot1.hbar(y=dodge('Kinase',0.25, range=plot1.y_range), right='Relative_Inhibited_Kinase_Activity', height=0.45, source=src, color="#e84d60", legend='Relative Inhibited Kinase Activity')
    plot1.add_tools(hover)
    return plot1

In [42]:
#Data for volcano plot:
#.copy() makes the filtered rows an independent frame, so adding the log columns
#below does not write into a slice of the previous df1
df1 = df1[df1.Fold_change != 0].copy() #remove rows where fold change is 0 (log2 undefined)
df1["Log_Fold_change"]=np.log2(df1["Fold_change"])
df1["Log_p_value"]=-np.log10(df1["p_value"])

In [17]:
def volplot_1(df1):
    """Build a volcano plot (log2 fold change vs -log10 p-value) for all rows.

    Rows with a fold change of 0 are excluded because their log2 is
    undefined. Two dashed horizontal lines mark the 5% and 1% significance
    levels; each can be shown or hidden with its toggle button.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with 'Fold_change' and 'p_value' columns, plus the 'Kinase',
        'Substrate' and 'Phosphosite' columns used in the hover tooltip.
        It is not modified.

    Returns
    -------
    bokeh layout
        The scatter plot stacked above the two toggle buttons.
    """
    #Data for volcano plot:
    nonzero = df1[df1.Fold_change != 0] #remove rows where fold change is 0
    # assign() returns a new frame, so nothing is written into the filtered slice
    plot_df = nonzero.assign(Log_Fold_change=np.log2(nonzero["Fold_change"]),
                             Log_p_value=-np.log10(nonzero["p_value"]))

    #Volcano plot 1:
    source = ColumnDataSource(plot_df)
    vol_hover = HoverTool(tooltips=[('Kinase','@Kinase'), ('Substrate', '@Substrate'),
                                    ('Modified Residue','@Phosphosite'), ('Fold Change','@Fold_change'), ('p-value', '@p_value')])
    p = figure(plot_width=700, plot_height=500)
    p.title.text = "Volcano Plot of the Log Fold Change and Log p-value for All Kinases"
    p.title.text_font_size = "20px"
    p.xaxis.axis_label = "Log Fold Change"
    p.yaxis.axis_label = "-Log p-value"
    p.scatter(x='Log_Fold_change', y='Log_p_value', source=source)
    p.add_tools(vol_hover)

    #Significance thresholds, on the same -log10 scale as the y axis;
    #points below a line are not significant at that level
    sig5 = Span(location=float(-np.log10(0.05)), dimension='width', line_color='#800000', line_width=1.75, line_dash='dashed') #5% (about 1.3)
    sig1 = Span(location=float(-np.log10(0.01)), dimension='width', line_color='#2F4F4F', line_width=1.75, line_dash='dashed') #1% (2)
    toggle1 = Toggle(label='1% Significance', button_type="success", active=True)
    toggle1.js_link('active', sig1, 'visible') #button state drives line visibility in the browser
    toggle2 = Toggle(label='5% Significance', button_type="success", active=True)
    toggle2.js_link('active', sig5, 'visible')
    p.add_layout(sig1)
    p.add_layout(sig5)
    plot2 = layout([p], [toggle1, toggle2])
    return plot2

In [47]:
#Data for volcano plot 2: only rows with no missing values
#(dropna hands back a new frame, so df1 itself is left as it was)
df2=df1.dropna(how='any')
df2.head()

Unnamed: 0,Substrate,Phosphosite,Control_mean,Inhibitor_mean,Fold_change,p_value,ctrlCV,treatCV,Kinase,Log_Fold_change,Log_p_value
182,ZMYM2,S305,1598739000.0,1839453000.0,1.150565,0.552138,0.27747,0.267264,PLK1,0.202342,0.257952
203,ZFP36L1,S54,29884160000.0,26361810000.0,0.882133,0.392399,0.197318,0.11427,PKACA,-0.180932,0.406272
203,ZFP36L1,S54,29884160000.0,26361810000.0,0.882133,0.392399,0.197318,0.11427,MAPKAPK2,-0.180932,0.406272
204,ZFP36L1,S334,217010900.0,416443700.0,1.918999,0.090441,0.595987,0.269975,PKACA,0.940354,1.043634
233,ZC3HC1,S395,13667250000.0,11390330000.0,0.833403,0.331212,0.110557,0.29951,CDK1,-0.262913,0.479893


In [18]:
def volplot_2(df1):
    """Build a volcano plot restricted to rows without missing values.

    Works like the all-rows volcano plot, but every row containing a NaN in
    any column (for example a row with no kinase assigned) is removed first.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with 'Fold_change' and 'p_value' columns, plus the 'Kinase',
        'Substrate' and 'Phosphosite' columns used in the hover tooltip.
        If 'Log_Fold_change' / 'Log_p_value' are already present they are
        used as given; otherwise they are computed here after dropping rows
        whose fold change is 0. The input is not modified.

    Returns
    -------
    bokeh layout
        The scatter plot stacked above the two toggle buttons.
    """
    #Data for volcano plot 2:
    df2 = df1.copy()
    if "Log_Fold_change" not in df2.columns or "Log_p_value" not in df2.columns:
        # The scatter below needs the log columns; derive them when the caller
        # passes a frame that has not been through the volcano-data step.
        df2 = df2[df2.Fold_change != 0] #log2 of 0 is undefined
        df2 = df2.assign(Log_Fold_change=np.log2(df2["Fold_change"]),
                         Log_p_value=-np.log10(df2["p_value"]))
    df2 = df2.dropna(how='any')

    #Volcano plot 2:
    source = ColumnDataSource(df2)
    vol_hover = HoverTool(tooltips=[('Kinase','@Kinase'), ('Substrate', '@Substrate'),
                                    ('Modified Residue','@Phosphosite'), ('Fold Change','@Fold_change'), ('p-value', '@p_value')])
    p2 = figure(plot_width=700, plot_height=500)
    p2.title.text = "Volcano Plot of the Log Fold Change and Log p-value for All Identified Kinases"
    p2.title.text_font_size = "15px"
    p2.xaxis.axis_label = "Log Fold Change"
    p2.yaxis.axis_label = "-Log p-value"
    p2.scatter(x='Log_Fold_change', y='Log_p_value', source=source)
    p2.add_tools(vol_hover)

    #Significance thresholds on the -log10 scale; points below a line are
    #not significant at that level
    sig5 = Span(location=1.3, dimension='width', line_color='#800000', line_width=1.75, line_dash='dashed') #5%: -log10(0.05) is about 1.3
    sig1 = Span(location=2, dimension='width', line_color='#2F4F4F', line_width=1.75, line_dash='dashed') #1%: -log10(0.01) = 2
    toggle1 = Toggle(label='1% Significance', button_type="success", active=True)
    toggle1.js_link('active', sig1, 'visible') #button state drives line visibility in the browser
    toggle2 = Toggle(label='5% Significance', button_type="success", active=True)
    toggle2.js_link('active', sig5, 'visible')
    p2.add_layout(sig1)
    p2.add_layout(sig5)
    plot3 = layout([p2], [toggle1, toggle2])
    return plot3