-
-
Notifications
You must be signed in to change notification settings - Fork 135
/
Copy pathpp_validate.R
211 lines (206 loc) · 8.7 KB
/
pp_validate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# Part of the rstanarm package for estimating model parameters
# Copyright (C) 2016, 2017 Trustees of Columbia University
# Copyright (C) 2005 Samantha Cook
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#' Model validation via simulation
#'
#' The \code{pp_validate} function is based on the methods described in
#' Cook, Gelman, and Rubin (2006) for validating software developed to fit
#' particular Bayesian models. Here we take the perspective that models
#' themselves are software and thus it is useful to apply this validation
#' approach to individual models.
#'
#' @export
#' @templateVar stanregArg object
#' @template args-stanreg-object
#' @param nreps The number of replications to be performed. \code{nreps} must be
#' sufficiently large so that the statistics described below in Details are
#' meaningful. Depending on the model and the size of the data, running
#' \code{pp_validate} may be slow. See also the Note section below for advice
#' on avoiding numerical issues.
#' @param seed A seed passed to Stan to use when refitting the model.
#' @param ... Currently ignored.
#'
#' @details
#' We repeat \code{nreps} times the process of simulating parameters and data
#' from the model and refitting the model to this simulated data. For each of
#' the \code{nreps} replications we do the following:
#' \enumerate{
#' \item Refit the model but \emph{without} conditioning on the data (setting
#' \code{prior_PD=TRUE}), obtaining draws \eqn{\theta^{true}}{\theta_true}
#' from the \emph{prior} distribution of the model parameters.
#' \item Given \eqn{\theta^{true}}{\theta_true}, simulate data \eqn{y^\ast}{y*}
#' from the \emph{prior} predictive distribution (calling
#' \code{\link{posterior_predict}} on the fitted model object obtained in step
#' 1).
#' \item Fit the model to the simulated outcome \eqn{y^\ast}{y*}, obtaining
#' parameters \eqn{\theta^{post}}{\theta_post}.
#' }
#' For any individual parameter, the quantile of the "true" parameter value with
#' respect to its posterior distribution \emph{should} be uniformly distributed.
#' The validation procedure entails looking for deviations from uniformity by
#' computing statistics for a test that the quantiles are uniformly distributed.
#' The absolute values of the computed test statistics are plotted for batches
#' of parameters (e.g., non-varying coefficients are grouped into a batch called
#' "beta", parameters that vary by group level are in batches named for the
#' grouping variable, etc.). See Cook, Gelman, and Rubin (2006) for more details
#' on the validation procedure.
#'
#' @note In order to make it through \code{nreps} replications without running
#' into numerical difficulties you may have to restrict the range for randomly
#' generating initial values for parameters when you fit the \emph{original}
#' model. With any of \pkg{rstanarm}'s modeling functions this can be done by
#' specifying the optional argument \code{init_r} as some number less than the
#' default of \eqn{2}.
#'
#' @return A ggplot object that can be further customized using the
#' \pkg{ggplot2} package.
#'
#' @references
#' Cook, S., Gelman, A., and Rubin, D.
#' (2006). Validation of software for Bayesian models using posterior quantiles.
#' \emph{Journal of Computational and Graphical Statistics}. 15(3), 675--692.
#'
#' @seealso
#' \code{\link{pp_check}} for graphical posterior predictive checks and
#' \code{\link{posterior_predict}} to draw from the posterior predictive
#' distribution.
#'
#' \code{\link[bayesplot:bayesplot-colors]{color_scheme_set}} to change the color scheme of the
#' plot.
#'
#' @examples
#' if (.Platform$OS.type != "windows" || .Platform$r_arch != "i386") {
#' \dontrun{
#' if (!exists("example_model")) example(example_model)
#' try(pp_validate(example_model)) # fails with default seed / priors
#' }
#' }
#' @importFrom ggplot2 rel geom_point geom_segment scale_x_continuous element_line
#'
pp_validate <- function(object, nreps = 20, seed = 12345, ...) {
# based on Samantha Cook's BayesValidate::validate
quant <- function(draws) {
n <- length(draws)
rank_theta <- c(1:n)[order(draws) == 1] - 1
quants <- (rank_theta + 0.5) / n
return(quants)
}
validate_stanreg_object(object)
if (is.stanmvreg(object))
STOP_if_stanmvreg("'pp_validate'")
if (nreps < 2)
stop("'nreps' must be at least 2.")
dims <- object$stanfit@par_dims[c("alpha", "beta", "b", "aux", "cutpoints", "theta_L")]
dims <- dims[!sapply(dims, is.null)]
dims <- sapply(dims, prod)
dims <- dims[dims > 0]
if ("b" %in% names(dims)) {
mark <- which(names(dims) == "b")
vals <- sapply(ranef(object), function(x) length(as.matrix(x)))
dims <- append(dims, values = vals, after = mark)
dims <- dims[-mark]
}
names(dims)[which(names(dims) == "theta_L")] <- "Sigma"
batches <- dims
params_batch <- names(dims)
num_batches <- length(batches)
num_params <- sum(dims)
batch_ind <- rep(0, num_batches + 1)
plot_batch <- rep(1, batches[1])
for (i in 1:num_batches)
batch_ind[i+1] <- batch_ind[i] + batches[i]
for (i in 2:num_batches)
plot_batch <- c(plot_batch, rep(i, batches[i]))
quantile_theta <- matrix(NA_real_, nrow = nreps, ncol = num_params + num_batches)
post <- suppressWarnings(update(object, prior_PD = TRUE, seed = seed, algorithm = "sampling",
warmup = 1000, iter = 1000 + 1, chains = nreps))
post_mat <- as.matrix(post)
data_mat <- posterior_predict(post)
constant <- apply(data_mat, 1, FUN = function(x) all(duplicated(x)[-1L]))
if (any(constant))
stop("'pp_validate' cannot proceed because some simulated outcomes are constant. ",
"Try again with better priors on the parameters.")
y <- get_y(object)
for (reps in 1:nreps) {
theta_true <- post_mat[reps, ]
data_rep <- data_mat[reps, ]
mf <- model.frame(object)
if (NCOL(mf[, 1]) == 2) { # binomial models
new_f <- update.formula(formula(object), cbind(ynew_1s, ynew_0s) ~ .)
ynew <- c(data_rep)
mf2 <- data.frame(mf[, -1], ynew_1s = ynew, ynew_0s = rowSums(y) - ynew)
mf <- get_all_vars(new_f, data = mf2)
} else {
new_f <- NULL
if (is.factor(y))
mf[, 1] <- factor(data_rep, levels = levels(y), ordered = is.ordered(y))
else
mf[, 1] <- c(data_rep)
}
update_args <- nlist(object, data = mf, seed)
if (!is.null(new_f))
update_args$formula <- new_f
theta_draws <- as.matrix(do.call("update", update_args))
if (!is.null(batches)){
for (i in 1:num_batches) {
if (batches[i] > 1) {
sel <- (batch_ind[i]+1):batch_ind[(i+1)]
theta_draws <- cbind(theta_draws,
apply(theta_draws[, sel], 1, mean))
theta_true <- c(theta_true, mean(theta_true[sel]))
} else {
theta_draws <- cbind(theta_draws, theta_draws[, (batch_ind[i]+1)])
theta_true <- c(theta_true, theta_true[(batch_ind[i]+1)])
}
}
}
theta_draws <- rbind(theta_true, theta_draws)
quantile_theta[reps, ] <- apply(theta_draws, 2, quant)
}
quantile_trans <- (apply(quantile_theta, 2, qnorm))^2
q_trans <- apply(quantile_trans, 2, sum)
p_vals <- pchisq(q_trans, df = nreps, lower.tail = FALSE)
z_stats <- abs(qnorm(p_vals))
if (is.null(batches)) {
adj_min_p <- num_params * min(p_vals)
} else {
z_batch <- z_stats[(num_params + 1):length(p_vals)]
p_batch <- p_vals[(num_params + 1):length(p_vals)]
adj_min_p <- num_batches * min(p_batch)
}
upper_lim <- max(max(z_stats + 1), 3.5)
plotdata <- data.frame(x = z_batch, y = params_batch)
scheme <- bayesplot::color_scheme_get()
ggplot(plotdata, aes_string(x = "x", y = "y")) +
geom_segment(
aes_string(x = "0", xend = "x", y = "y", yend = "y"),
color = scheme[["mid"]],
size = rel(1)
) +
geom_point(
size = rel(3),
shape = 21,
fill = scheme[["dark"]],
color = scheme[["dark_highlight"]]
) +
scale_x_continuous(limits = c(0, upper_lim), expand = c(0, 0)) +
xlab(expression("Absolute " * z[theta] * " Statistics")) +
theme_default() +
yaxis_title(FALSE) +
grid_lines(color = "gray", size = 0.1)
}