# Figure 1 (Initial Data Analysis)

### Standard imports and loading data

In [2]:
import pandas as pd
import numpy as np
import xlrd
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.layouts import row, column
from bokeh.palettes import Viridis8
from bokeh.models import Span
output_notebook()

## B_thai No Glucose Analysis

In [3]:
rpal = pd.read_excel("Biod_Rpal_Global_proteins.xls")

### Find standard biological variability by using the standard deviations of each time point

In [4]:
rpal_stdev = pd.DataFrame()
rpal_stdev['tp1'] = np.log(rpal[['An_T11', 'An_T12', 'An_T13', 'An_T14']].std(axis=1)).replace([np.inf, -np.inf], 0)
rpal_stdev['tp2'] = np.log(rpal[['An_T21', 'An_T22', 'An_T23', 'An_T24']].std(axis=1)).replace([np.inf, -np.inf], 0)
rpal_stdev['tp3'] = np.log(rpal[['An_T31', 'An_T32', 'An_T33', 'An_T34']].std(axis=1)).replace([np.inf, -np.inf], 0)
rpal_stdev['tp4'] = np.log(rpal[['An_T41', 'An_T42', 'An_T43', 'An_T44']].std(axis=1)).replace([np.inf, -np.inf], 0)
rpal_stdev['tp5'] = np.log(rpal[['An_T51', 'An_T52', 'An_T53', 'An_T54']].std(axis=1)).replace([np.inf, -np.inf], 0)
rpal_stdev = rpal_stdev.dropna(axis = 0)

### Find the typical biological variability--the number below which 99% of standard deviations (across all time points) lie

In [5]:
all_stdev = pd.melt(rpal_stdev)['value'].tolist()
all_stdev.sort()
logcutoff = all_stdev[int(len(all_stdev)*0.99)]
cutoff = 2**logcutoff
print(cutoff)
print(logcutoff)

0.5061564081382521
-0.9823448317509463


## Plot all standard deviations combined

In [5]:
# Build the basic figure
std_p = figure(width=1000, plot_height=1000, title = 'Distribution of Standard Deviations for All Time Points',
                  x_axis_label = 'Standard Deviation (Log Scale)',
                  y_axis_label = 'Number of Proteins')

# Create the histogram and add it to the graph
tp1_std_hist, edges = np.histogram(all_stdev, density=False, bins=100)

# Add each histogram to the graph
std_p.quad(top=tp1_std_hist, bottom=0, left=edges[:-1], right=edges[1:], fill_alpha=0.70, line_color='white')

# Add significance cutoff line
vline = Span(location = logcutoff, dimension = 'height', line_width=2)
std_p.renderers.extend([vline])

# Plot Styling
std_p.xgrid.visible = False
std_p.ygrid.visible = False
std_p.xaxis.minor_tick_line_color = None
std_p.yaxis.minor_tick_line_color = None
std_p.legend.click_policy = 'hide'
std_p.title.text_font_size = '25pt'
std_p.title.align = 'center'
std_p.xaxis.axis_label_text_font_size = '30pt'
std_p.yaxis.axis_label_text_font_size = '30pt'
std_p.xaxis.major_label_text_font_size = "15pt"
std_p.yaxis.major_label_text_font_size = "15pt"
std_p.legend.label_text_font_size = '15pt'

show(std_p)

You are attemptings to set `plot.legend.click_policy` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with the 'legend' parameter set.

You are attemptings to set `plot.legend.label_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with the 'legend' parameter set.



## Plot all the standard deviations on top of each other

In [6]:
# Build the basic figure
std_p = figure(width=1000, plot_height=1000, title = 'Distribution of Standard Deviations at Each Time Point',
                  x_axis_label = 'Standard Deviation (Log Scale)',
                  y_axis_label = 'Number of Proteins')

# Create the histograms for each time point
tp1_std_hist, edges1 = np.histogram(rpal_stdev['tp1'], density=False, bins=100)
tp2_std_hist, edges2 = np.histogram(rpal_stdev['tp2'], density=False, bins=100)
tp3_std_hist, edges3 = np.histogram(rpal_stdev['tp3'], density=False, bins=100)
tp4_std_hist, edges4 = np.histogram(rpal_stdev['tp4'], density=False, bins=100)
tp5_std_hist, edges5 = np.histogram(rpal_stdev['tp5'], density=False, bins=100)

# Add each histogram to the graph
for data, edge, name, color in zip([tp1_std_hist, tp2_std_hist, tp3_std_hist, tp4_std_hist, tp5_std_hist], [edges1, edges2, edges3, edges4, edges5], ['Time Point 1', 'Time Point 2', 'Time Point 3', 'Time Point 4', 'Time Point 5'], Viridis8):
    std_p.quad(top=data, bottom=0, left=edge[:-1], right=edge[1:], color=color, fill_alpha = 0.25, legend=name)

# Add significance cutoff line
vline = Span(location = logcutoff, dimension = 'height', line_width=2)
std_p.renderers.extend([vline])

# Plot Styling
std_p.xgrid.visible = False
std_p.ygrid.visible = False
std_p.xaxis.minor_tick_line_color = None
std_p.yaxis.minor_tick_line_color = None
std_p.legend.click_policy = 'hide'
std_p.title.text_font_size = '25pt'
std_p.title.align = 'center'
std_p.xaxis.axis_label_text_font_size = '30pt'
std_p.yaxis.axis_label_text_font_size = '30pt'
std_p.xaxis.major_label_text_font_size = "15pt"
std_p.yaxis.major_label_text_font_size = "15pt"
std_p.legend.label_text_font_size = '15pt'

show(std_p)

## Calculate the means for each time point

In [6]:
rpal_mean = pd.DataFrame()
rpal_mean['tp1'] = rpal[['An_T11', 'An_T12', 'An_T13', 'An_T14']].mean(axis=1)
rpal_mean['tp2'] = rpal[['An_T21', 'An_T22', 'An_T23', 'An_T24']].mean(axis=1)
rpal_mean['tp3'] = rpal[['An_T31', 'An_T32', 'An_T33', 'An_T34']].mean(axis=1)
rpal_mean['tp4'] = rpal[['An_T41', 'An_T42', 'An_T43', 'An_T44']].mean(axis=1)
rpal_mean['tp5'] = rpal[['An_T51', 'An_T52', 'An_T53', 'An_T54']].mean(axis=1)
rpal_mean = rpal_mean.dropna(axis = 0)

## Cluster the proteins (manually) based on increasing/decreasing/no change/spike at time point

In [7]:
cutoff_95 = 0.9007950046165919
cutoff_99 = 1.7792177941476046

rpal_mean['max'] = rpal_mean[['tp1', 'tp2', 'tp3', 'tp4', 'tp5']].max(axis=1)
rpal_mean['min'] = rpal_mean[['tp1', 'tp2', 'tp3', 'tp4', 'tp5']].min(axis=1)
rpal_mean['diff_max_min'] = rpal_mean['max'] - rpal_mean['min']
bin_rejects = rpal_mean.loc[rpal_mean['diff_max_min'] < cutoff]
significant_change = rpal_mean.loc[rpal_mean['diff_max_min'] >= cutoff]
significant_change['max'] = significant_change[['tp1', 'tp2', 'tp3', 'tp4', 'tp5']].max(axis=1)

bin_1 = significant_change[significant_change['tp1'] == significant_change['max']]
bin_2 = significant_change[significant_change['tp2'] == significant_change['max']]
bin_3 = significant_change[significant_change['tp3'] == significant_change['max']]
bin_4 = significant_change[significant_change['tp4'] == significant_change['max']]
bin_5 = significant_change[significant_change['tp5'] == significant_change['max']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## Create a simple chart to show the number of proteins in each category

In [8]:
bins = ['Early_Log', 'Mid_Log', 'Late_Log', 'Early_Stationary', 'Late_Stationary']
counts = [len(bin_1), len(bin_2), len(bin_3), len(bin_4), len(bin_5)]
print(counts)

p = figure(x_range = bins, plot_height = 500,
           x_axis_label = 'Growth Phase of Highest Abundance',
           y_axis_label = 'Number of Proteins')

p.vbar(x = bins, top = counts, width = 0.9, color = Viridis8[0:5])

p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '20pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '15pt'
p.yaxis.axis_label_text_font_size = '15pt'
p.yaxis.minor_tick_line_color = None

show(p)

[90, 33, 69, 80, 315]


## Plot each category

In [10]:
p = figure(plot_width=400, plot_height=400)

for i in range(0, len(bin_rejects)):
    p.line([1, 2, 3, 4, 5], [bin_rejects.tp1.iloc[i], bin_rejects.tp2.iloc[i], bin_rejects.tp3.iloc[i], bin_rejects.tp4.iloc[i], bin_rejects.tp5.iloc[i]])
    
p.xaxis.axis_label = 'Time Point'
p.yaxis.axis_label = 'Relative Protein Abundance'
p.xgrid.visible = False
p.ygrid.visible = False
p.xaxis.major_label_overrides = {1: 'Early_Log', 2: 'Mid_Log', 3: 'Late_Log', 4: 'Early_Stationary', 5: 'Late_Stationary'}
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None
    
show(p)

In [11]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Growth Phase',
           y_axis_label = 'Relative Protein Abundance')

for i in range(0, len(bin_1)):
    p.line([1, 2, 3, 4, 5], [bin_1.tp1.iloc[i], bin_1.tp2.iloc[i], bin_1.tp3.iloc[i], bin_1.tp4.iloc[i], bin_1.tp5.iloc[i]])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '32pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '30pt'
p.yaxis.axis_label_text_font_size = '30pt'
p.xaxis.major_label_text_font_size = "25pt"
p.xaxis.major_label_overrides = {1: 'enter log', 2: 'mid log', 3: 'late log', 4: 'early stat.', 5: 'late stat.'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None
    
show(p)

In [12]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Growth Phase',
           y_axis_label = 'Relative Protein Abundance')

for i in range(0, len(bin_2)):
    p.line([1, 2, 3, 4, 5], [bin_2.tp1.iloc[i], bin_2.tp2.iloc[i], bin_2.tp3.iloc[i], bin_2.tp4.iloc[i], bin_2.tp5.iloc[i]])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '32pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '30pt'
p.yaxis.axis_label_text_font_size = '30pt'
p.xaxis.major_label_text_font_size = "25pt"
p.xaxis.major_label_overrides = {1: 'enter log', 2: 'mid log', 3: 'late log', 4: 'early stat.', 5: 'late stat.'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None
    
show(p)

In [13]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Growth Phase',
           y_axis_label = 'Relative Protein Abundance')

for i in range(0, len(bin_3)):
    p.line([1, 2, 3, 4, 5], [bin_3.tp1.iloc[i], bin_3.tp2.iloc[i], bin_3.tp3.iloc[i], bin_3.tp4.iloc[i], bin_3.tp5.iloc[i]])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '32pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '30pt'
p.yaxis.axis_label_text_font_size = '30pt'
p.xaxis.major_label_text_font_size = "25pt"
p.xaxis.major_label_overrides = {1: 'enter log', 2: 'mid log', 3: 'late log', 4: 'early stat.', 5: 'late stat.'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None
    
show(p)

In [14]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Growth Phase',
           y_axis_label = 'Relative Protein Abundance')

for i in range(0, len(bin_4)):
    p.line([1, 2, 3, 4, 5], [bin_4.tp1.iloc[i], bin_4.tp2.iloc[i], bin_4.tp3.iloc[i], bin_4.tp4.iloc[i], bin_4.tp5.iloc[i]])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '32pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '30pt'
p.yaxis.axis_label_text_font_size = '30pt'
p.xaxis.major_label_text_font_size = "25pt"
p.xaxis.major_label_overrides = {1: 'enter log', 2: 'mid log', 3: 'late log', 4: 'early stat.', 5: 'late stat.'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None

show(p)

In [15]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Growth Phase',
           y_axis_label = 'Relative Protein Abundance')

for i in range(0, len(bin_5)):
    p.line([1, 2, 3, 4, 5], [bin_5.tp1.iloc[i], bin_5.tp2.iloc[i], bin_5.tp3.iloc[i], bin_5.tp4.iloc[i], bin_5.tp5.iloc[i]])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.text_font_size = '32pt'
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '30pt'
p.yaxis.axis_label_text_font_size = '30pt'
p.xaxis.major_label_text_font_size = "25pt"
p.xaxis.major_label_overrides = {1: 'enter log', 2: 'mid log', 3: 'late log', 4: 'early stat.', 5: 'late stat.'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None
    
show(p)

## Combine all into a single plot using best-fit lines

In [16]:
bin_1.loc['mean'] = bin_1.mean()
bin_2.loc['mean'] = bin_2.mean()
bin_3.loc['mean'] = bin_3.mean()
bin_4.loc['mean'] = bin_4.mean()
bin_5.loc['mean'] = bin_5.mean()

columns = bin_1.columns
bestfit_lines = pd.DataFrame(columns = columns)
bestfit_lines.loc['bin1_mean'] = bin_1.loc['mean']
bestfit_lines.loc['bin2_mean'] = bin_2.loc['mean']
bestfit_lines.loc['bin3_mean'] = bin_3.loc['mean']
bestfit_lines.loc['bin4_mean'] = bin_4.loc['mean']
bestfit_lines.loc['bin5_mean'] = bin_5.loc['mean']

bestfit_lines

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

Unnamed: 0,tp1,tp2,tp3,tp4,tp5,max,min,diff_max_min
bin1_mean,0.310476,0.079168,-0.123585,-0.148868,-0.418527,0.310476,-0.490348,0.800824
bin2_mean,0.095943,0.244022,0.096302,-0.07863,-0.494353,0.244022,-0.504712,0.748733
bin3_mean,-0.071415,0.121547,0.316886,0.121998,-0.447154,0.316886,-0.46642,0.783306
bin4_mean,-0.232368,-0.150816,-0.00909,0.378183,-0.063847,0.378183,-0.369986,0.748169
bin5_mean,-0.319029,-0.369213,-0.354814,-0.148556,0.458551,0.458551,-0.434654,0.893206


In [17]:
p = figure(plot_width=1000, plot_height=1000,
           x_axis_label = 'Time Point',
           y_axis_label = 'Relative Protein Abundance')
growth_stages = ['Early Log', 'Mid Log', 'Late Log', 'Early Stationary', 'Late Stationary']

for i in range(0, len(bestfit_lines)):
    p.line([1, 2, 3, 4, 5], [bestfit_lines.tp1.iloc[i], bestfit_lines.tp2.iloc[i], bestfit_lines.tp3.iloc[i], bestfit_lines.tp4.iloc[i], bestfit_lines.tp5.iloc[i]], color=Viridis8[i], line_width=7, legend=growth_stages[i])
    
p.xgrid.visible = False
p.ygrid.visible = False
p.title.align = 'center'
p.xaxis.axis_label_text_font_size = '20pt'
p.yaxis.axis_label_text_font_size = '20pt'
p.xaxis.major_label_text_font_size = '15pt'
p.xaxis.major_label_overrides = {1: 'Early_Log', 2: 'Mid_Log', 3: 'Late_Log', 4: 'Early_Stationary', 5: 'Late_Stationary'}
p.yaxis.major_label_text_font_size = "15pt"
p.xaxis.minor_tick_line_color = None
p.yaxis.minor_tick_line_color = None

show(p)