/
stan_workup.py
164 lines (140 loc) · 4.82 KB
/
stan_workup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Includes helper functions for analyzing and visualizing the outputs of
a Bayesian approach to enzyme kinetics in Stan.
Many of these approaches are derived from Justin Bois's courses at Caltech,
specifically bebi103 'Data Analysis in the Biological Sciences', with code
available at https://github.com/justinbois/bebi103.
These are often very specific to the data used in this analysis and not
all that generalizable. Certain naming schemes within functions should
be updated if used.
"""
# Data processing
import numpy as np
import pandas as pd
# Stats
from statsmodels.distributions.empirical_distribution import ECDF
# Data viz
import bokeh.io
import bokeh.plotting
def summarize(data, name=None, units=None, plot=True):
"""Summarizes an array of posterior sample draws.
Parameters:
-----------
data: pandas Series or numpy array
A 1D array containing MCMC posterior sample draws
name: string
Name of the parameter; automatically pulled if
data is a pd.Series and name=None
units: string
The units associated with the parameter
plot: bool, default True
Whether or not to display an empirical cumulative
distribution (ECDF) plot
Returns:
--------
summary: DataFrame
Contains the summary stats of the samples
"""
if isinstance(data, pd.Series) and name is None:
name = data.name
if units is not None:
name = f'{name} ({units})'
vals = np.round(np.quantile(data, [0.5, 0.025, 0.975]), 3)
df = pd.DataFrame({
'value': name,
'median': vals[0],
'95% CR': f'[{vals[1]}, {vals[2]}]'
},
index=[0])
if plot:
ecdf = ECDF(data.values)
p = bokeh.plotting.figure(plot_width=400, plot_height=300)
p.xaxis.axis_label = name
p.yaxis.axis_label = 'ECDF'
p.circle(ecdf.x, ecdf.y)
bokeh.io.show(p)
return df
def predictive_regression(
df,
ppc_name,
percentiles=[95, 75, 50, 25],
x_label='[indole] (µM)',
y_label='k (per sec)',
width=500,
height=400,
colors=[],
):
"""Displays the credible regions of the posterior, given
that the model contains posterior predictive checks (ppcs).
In this instance, ppcs are obtained on the range (1, n+1),
where 'n' corresponds to the concentration of substrate.
Parameters:
-----------
df: stan_samples.to_dataframe() object
DataFrame obtained from the sampling that contains
the ppc values
ppc_name: string
Name of base ppc variable in stan code. Samples are
written in the form ppc_name[i]
percentiles: list
Percentile regions to display, extending from the median
value. Takes values from 0 to 100 (probably exclusive...)
x_label: string
Name of the x-axis values for the regression
y_label: string
Name of the y-axis values (ppc samples) for the regression
width: int
Plot width
height: int
Plot height
colors: list-like
Colors for the percentiles, from first to last. Not really
validated against the percentiles list here, so keep that in
mind.
"""
# Build ptiles (percentile ranges)
ptiles = [pt for pt in percentiles if pt > 0]
ptiles = ([50 - pt/2 for pt in percentiles] + [50]
+ [50 + pt/2 for pt in percentiles[::-1]])
ptiles = np.array(ptiles) / 100
ptiles_str = [str(pt) for pt in ptiles]
# Get ppc dataframe quantiles
ppc_string = f'{ppc_name}['
columns = [col for col in df.columns if ppc_string in col]
df_ppc = df[columns].quantile(ptiles).reset_index().melt(id_vars='index', var_name='ppc', value_name=y_label)
# Convert to indole concentration (or whatever x-axis label)
df_ppc[x_label] = df_ppc['ppc'].apply(lambda x: int(x.replace(ppc_string, '').replace(']', '')) - 1)
# Set new index
df_ppc.set_index(['index', 'ppc'], inplace=True)
df_ppc.index.names = ['quantile', 'ppc']
# Prepare values
concs = df_ppc[x_label].unique()
if not colors:
colors = [
'#bfd3bf',
'#99b899',
'#739d73',
'#4d824d',
'#266826',
'#004D00',
]
# Set up plot
p = bokeh.plotting.figure(plot_width=width, plot_height=height)
p.xaxis.axis_label = x_label
p.yaxis.axis_label = y_label
# Add credible regions
for i, ptile in enumerate(ptiles_str[:len(ptiles_str)//2]):
p.varea(
x=concs,
y1=df_ppc.loc[float(ptile)][y_label],
y2=df_ppc.loc[float(ptiles_str[-i-1])][y_label],
color=colors[i],
)
# Plot the median
p.line(
x=concs,
y=df_ppc.loc[0.5][y_label],
line_width=2,
color=colors[-1],
)
return p