forked from GabrielHoffman/variancePartition
/
confidenceIntervals.R
104 lines (83 loc) · 4.7 KB
/
confidenceIntervals.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#' Class VarParCIList
#'
#' Class \code{VarParCIList}
#'
#' @name VarParCIList-class
#' @rdname VarParCIList-class
#' @exportClass VarParCIList
setClass("VarParCIList", representation(method = "character"), contains = "list")
#' Linear mixed model confidence intervals
#'
#' Fit linear mixed model to estimate contribution of multiple sources of variation while simultaneously correcting for all other variables. Then perform parametric bootstrap sampling to get a 95\% confidence intervals for each variable for each gene.
#'
#' @param exprObj matrix of expression data (g genes x n samples), or \code{ExpressionSet}, or \code{EList} returned by \code{voom()} from the \code{limma} package
#' @param formula specifies variables for the linear (mixed) model. Must only specify covariates, since the rows of exprObj are automatically used as a response. e.g.: \code{~ a + b + (1|c)}
#' @param data \code{data.frame} with columns corresponding to formula
#' @param REML use restricted maximum likelihood to fit linear mixed model. default is FALSE. Strongly discourage against changing this option, but here for compatibility.
#' @param useWeights if TRUE, analysis uses heteroskedastic error estimates from \code{voom()}. Value is ignored unless exprObj is an \code{EList} from \code{voom()} or \code{weightsMatrix} is specified
#' @param control control settings for \code{lmer()}
#' @param nsim number of bootstrap datasets
#' @param ... Additional arguments for \code{lmer()} or l\code{m()}
#'
#' @return
#' \code{list()} of where each entry is the result for a gene. Each entry is a matrix of the 95\% confidence interval of the variance fraction for each variable
#'
#' @details
#' A linear mixed model is fit for each gene, and \code{bootMer()} is used to generate parametric bootstrap confidence intervals. \code{use.u=TRUE} is used so that the \eqn{\hat{u}} values from the random effects are used as estimated and are not re-sampled. This gives confidence intervals as if additional data were generated from these same current samples. Conversely, \code{use.u=FALSE} assumes that this dataset is a sample from a larger population. Thus it simulates \eqn{\hat{u}} based on the estimated variance parameter. This approach gives confidence intervals as if additional data were collected from the larger population from which this dataset is sampled. Overall, \code{use.u=TRUE} gives smaller confidence intervals that are appropriate in this case.
#'
#' @examples
#'
#' # load library
#' # library(variancePartition)
#'
#' library(BiocParallel)
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#'
#' # Specify variables to consider
#' # Age is continuous so we model it as a fixed effect
#' # Individual and Tissue are both categorical, so we model them as random effects
#' form <- ~ Age + (1 | Individual) + (1 | Tissue)
#'
#' # Compute bootstrap confidence intervals for each variable for each gene
#' resCI <- varPartConfInf(geneExpr[1:5, ], form, info, nsim = 100)
#'
#' @importFrom stats as.formula
#' @importFrom lme4 bootMer
#' @export
varPartConfInf <- function(exprObj, formula, data, REML = FALSE, useWeights = TRUE, control = vpcontrol, nsim = 1000, ...) {
if (!is.numeric(nsim) || nsim <= 0) {
stop("Must specify nsim as positive number")
}
# need derivs in boostrapping
control$calc.derivs <- TRUE
formula <- as.formula(formula)
# define bootstrap function
bootStrapFxn <- local(function(fit) {
if (is(fit, "lmerMod")) {
# use bootMer from lme4 to sample to bootstraps and refit the model
# use.u=TRUE says that the \hat{u} values from the random effects are used
bootRes <- bootMer(fit, calcVarPart, use.u = TRUE, nsim = nsim, parallel = "no", ncpus = 1)
apply(bootRes$t, 2, function(x) quantile(x, c(0.025, 0.975), na.rm = TRUE))
} else if (is(fit, "lm")) {
stop("Bootstrap of fixed effect ANOVA model not currently supported")
}
})
# fit the model and run the bootstrap function for each gene
res <- fitVarPartModel(exprObj = exprObj, formula = formula, data = data, REML = REML, useWeights = useWeights, fxn = bootStrapFxn, control = control, ...)
# res@method = "bootstrap"
return(res)
}
# form = ~ Age + Individual + Tissue
# fit = lm(geneExpr[1,]~ Age + Individual + Tissue, info, weights=1:nrow(info))
# bs2 <- function(formula, data, indices) {
# d <- data[indices,]
# fit2 <- lm(formula, data=d, weights=fit$weights[indices])
# return( calcVarPart(fit2) )
# }
# bootRes <- boot(data=data.frame(info, gene=geneExpr[1,]), statistic=bs2, R=1000, formula= paste("gene ~", as.character(form))
# apply( bootRes$t, 2, function(x) quantile(x, c(0.025, 0.975)))
# calcVarPart( fit )