## Data Visualisation

In [None]:
#To do:


In [1]:
import pandas as pd
import numpy as np
import requests
import re
import sqlite3

In [3]:
df= pd.read_csv('az20.tsv', sep='\t', header=0, index_col=False, usecols=list(range(0,7)), na_values='inf') 
#reads text file, headings= row 0 in file, only uses the first 7 columns(the only colums with data), sets 'inf' as 'NaN'
#df.head() #Check dataframe is loaded correctly- the initial 5 rows
#df.tail() #Checking the last 5 rows
df.columns=['Substrate','Control_mean','Inhibitor_mean','Fold_change','p_value','ctrlCV','treatCV'] #rename columns
df=df.fillna({'ctrlCV':0, 'treatCV':0}) #replace NaN in variance columns with 0
df=df.dropna(axis='index', how='any') #drops any row with an NaN value
#remove rows where there is no phosphosite
df=df[~df.Substrate.str.contains("None")]
#remove rows where M is the residue- kinases only phosphorylate S, T and Y
M= r"\([M]\d+\)" #matches M in brackets with one or more digits
df=df[~df.Substrate.str.contains(M)] #drops rows with M residue
phos=df.Substrate.str.findall(r"\((.\d+)").apply(','.join, 1)
#matches any character followed by one or more digits, after a parenthesis
df.insert(1, "Phosphosite", phos, True) #inserts phosphosite data as the second column
df[["Substrate"]]=df.Substrate.str.extract(r"(.+)\(")
#extracts the substrate (one or more of any character) before a parenthesis 
df.head()

Unnamed: 0,Substrate,Phosphosite,Control_mean,Inhibitor_mean,Fold_change,p_value,ctrlCV,treatCV
0,1A24_HUMAN,S356,15279340.0,26434390.0,1.730074,0.554298,1.280092,0.902944
1,1A24_HUMAN,S359,15279340.0,26434390.0,1.730074,0.554298,1.280092,0.902944
7,AAAS,S495,3886162000.0,4023860000.0,1.035433,0.798476,0.118671,0.190204
9,AAGAB,S310,12372040.0,5187831.0,0.419319,0.532084,1.332381,1.732051
10,AAGAB,S311,13985210.0,5187831.0,0.370951,0.427256,1.106864,1.732051


In [5]:
#Find substrate gene name from uniprot API
def find_sub_gene(entry):
     if re.match(r".+_HUMAN", entry):
        URL = 'http://www.uniprot.org/uniprot/?query='+entry+'&columns=genes(PREFERRED)&format=tab'
        r = requests.get(URL)
        content = r.text.splitlines()
        gene_name=content[1:2]        
        return str(gene_name)  #returns gene as a string
    else:
        return entry           #if entry doesn't match regex, return the entry (gene name)

IndentationError: unindent does not match any outer indentation level (<ipython-input-5-25117058aafc>, line 9)

In [15]:
df.Substrate=df.apply(lambda row: find_sub_gene(row["Substrate"]), axis=1)
df.Substrate=df.Substrate.str.strip("[]").str.strip("''") #remove [] and ''
df.Substrate.replace("", np.nan, inplace=True) #for uniprot entries with no gene convert to Nan
df.dropna(subset=["Substrate"], inplace=True) #drop rows with NaN substrates
df.head()

Unnamed: 0,Substrate,Phosphosite,Control_mean,Inhibitor_mean,Fold_change,p_value,ctrlCV,treatCV
7,AAAS,S495,3886162000.0,4023860000.0,1.035433,0.798476,0.118671,0.190204
9,AAGAB,S310,12372040.0,5187831.0,0.419319,0.532084,1.332381,1.732051
10,AAGAB,S311,13985210.0,5187831.0,0.370951,0.427256,1.106864,1.732051
11,AAK1,S14,1018114000.0,963899000.0,0.94675,0.843711,0.354679,0.289161
12,AAK1,S624,29833400.0,199868500.0,6.699488,0.279794,1.268167,1.22601


In [29]:
def read_database(database):
    conn = sqlite3.connect(database) #connect to our database
    phosdf=pd.read_sql_query('SELECT Kin_Gene_Name, Substrate_Gene_Name, Substrate_Modified_Residue FROM PhosphoSites', conn) 
    #put the PhosphoSites table into a dataframe
    return phosdf
read_database('11.db')

Unnamed: 0,Kin_Gene_Name,Substrate_Gene_Name,Substrate_Modified_Residue
0,HRI,EIF2S1,S52
1,HRI,EIF2S1,S49
2,PKR,EIF2S1,S49
3,PKR,EIF2S1,S52
4,PKR,TP53,S392
5,PKR,MAPT,S356
6,PKR,MAPT,S262
7,PKR,CDK1,Y4
8,PKR,EIF2AK2,S242
9,PKR,EIF2AK2,Y162


In [30]:
#Find Kin_Gene_Name from Substrate_Gene_Name and Substrate_Modified_Residue
def kinase_retriever(substrate, phosphosite):
    

In [24]:
print(df.iloc[[220]])

    Substrate Phosphosite  Control_mean  Inhibitor_mean  Fold_change  \
349     AJUBA        T265   671121967.5     423617463.7     0.631208   

      p_value    ctrlCV   treatCV  
349  0.052397  0.082134  0.398783  


In [2]:
#log fold change
df["Log_Fold_change"]=np.log2(df["Fold_change"]) #log2 is usually used for log fold change
df["Log_p_value"]=-np.log10(df["p_value"]) #-log10 for p-value in a volcano plot
#df.insert()
df.head()

NameError: name 'df' is not defined

In [None]:
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import Span, ColumnDataSource, HoverTool, Toggle
from bokeh.layouts import layout
from bokeh.embed import file_html

In [None]:
#basic volcano plot

#use output_notebook so it can be visualised in this notebook
output_notebook()
#output_file("test.html")
p = figure(plot_width=700, plot_height=500)
p.title.text="Volcano Plot"
p.title.text_font_size = "25px"
p.xaxis.axis_label ="Log Fold Change"
p.yaxis.axis_label ="-Log p-value"
#Significance thresholds:
sig5=Span(location=1.3, dimension='width', line_color='#800000', line_width=1.75, line_dash='dashed') #5%
sig1=Span(location=2, dimension='width', line_color='#2F4F4F', line_width=1.75, line_dash='dashed') #1%
toggle1=Toggle(label='1% Significance', button_type="success", active=True)
toggle1.js_link('active', sig1, 'visible')
toggle2=Toggle(label='5% Significance', button_type="success", active=True)
toggle2.js_link('active', sig5, 'visible')
p.add_layout(sig1) #adds horizontal line where points below line are non-sig fold changes(-log(0.05)=1.3)
p.add_layout(sig5)
#Tools
#hover=
#Data points
p.scatter(x=df.Log_Fold_change, y=df.Log_p_value)
show(layout([p], [toggle1, toggle2]))