-
Notifications
You must be signed in to change notification settings - Fork 1
/
getAMR.R
222 lines (205 loc) · 10.7 KB
/
getAMR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#' Search for aberrantly methylated regions
#'
#' @description
#' `getAMR` returns a `GRanges` object with all the aberrantly methylated
#' regions (AMRs) for all samples in a data set.
#'
#' @details
#' In the provided data set, `getAMR` compares methylation beta values of each
#' sample with other samples to identify rare long-range methylation
#' aberrations. For `ramr.method=="IQR"`: for every genomic location (CpG) in
#' `data.ranges` the IQR-normalized deviation from the median value is
#' calculated, and all CpGs with such normalized deviation not smaller than the
#' `iqr.cutoff` are retained. For `ramr.method=="*beta"`: parameters of beta
#' distribution are estimated by means of `EnvStats::ebeta` or `ExtDist::eBeta`
#' functions, and then used to calculate the probability values, followed by the
#' filtering when all CpGs with p-values not greater than `qval.cutoff` are
#' retained. Another filtering is then performed to exclude all CpGs within
#' `exclude.range`. Next, the retained (significant) CpGs are merged within
#' the window of `merge.window`, and final filtering is applied to AMR genomic
#' ranges (by `min.cpgs` and `min.width`).
#'
#' @param data.ranges A `GRanges` object with genomic locations and
#' corresponding beta values included as metadata.
#' @param data.samples A character vector with sample names (a subset of
#' metadata column names). If `NULL` (the default), then all samples (metadata
#' columns) are included in the analysis.
#' @param ramr.method A character scalar: when ramr.method is "IQR" (the
#' default), the filtering based on interquantile range is used (`iqr.cutoff`
#' value is then used as a threshold). When "beta" or "wbeta" - filtering based
#' on fitting non-weighted (`EnvStats::ebeta`) or weighted (`ExtDist::eBeta`)
#' beta distributions, respectively, is used, and `pval.cutoff` or `qval.cutoff`
#' (if not `NULL`) is used as a threshold. For "wbeta", weights directly
#' correlate with bin contents (number of values per bin) and inversly - with
#' the distances from the median value, thus narrowing the estimated
#' distribution and emphasizing outliers.
#' @param iqr.cutoff A single integer >= 1. Methylation beta values differing
#' from the median value by more than `iqr.cutoff` interquartile ranges are
#' considered to be significant (the default: 5).
#' @param pval.cutoff A numeric scalar (the default: 5e-2). Bonferroni
#' correction of `pval.cutoff` by the length of the `data.samples` object is
#' used to calculate `qval.cutoff` if the latter is `NULL`.
#' @param qval.cutoff A numeric scalar. Used as a threshold for filtering based
#' on fitting non-weighted or weighted beta distributions: all p-values lower
#' than `qval.cutoff` are considered to be significant. If `NULL` (the default),
#' it is calculated using `pval.cutoff`
#' @param merge.window A positive integer. All significant (survived the
#' filtering stage) `data.ranges` genomic locations within this distance will be
#' merged to create AMRs (the default: 300).
#' @param min.cpgs A single integer >= 1. All AMRs containing less than
#' `min.cpgs` significant genomic locations are filtered out (the default: 7).
#' @param min.width A single integer >= 1 (the default). Only AMRs with the
#' width of at least `min.width` are returned.
#' @param exclude.range A numeric vector of length two. If not `NULL` (the
#' default), all `data.ranges` genomic locations with their median methylation
#' beta value within the `exclude.range` interval are filtered out.
#' @param cores A single integer >= 1. Number of processes for parallel
#' computation (the default: all but one cores). Results of parallel processing
#' are fully reproducible when the same seed is used (thanks to doRNG).
#' @param verbose boolean to report progress and timings (default: TRUE).
#' @param ... Further arguments to be passed to `EnvStats::ebeta` or
#' `ExtDist::eBeta` functions.
#' @return The output is a `GRanges` object that contains all the aberrantly
#' methylated regions (AMRs) for all `data.samples` samples in `data.ranges`
#' object. The following metadata columns may be present:
#' \itemize{
#' \item `revmap` -- integer list of significant CpGs (`data.ranges` genomic
#' locations) that are included in this AMR region
#' \item `ncpg` -- number of significant CpGs within this AMR region
#' \item `sample` -- contains an identifier of a sample to which
#' corresponding AMR belongs
#' \item `dbeta` -- average deviation of beta values for significant CpGs from
#' their corresponding median values
#' \item `pval` -- geometric mean of p-values for significant CpGs
#' \item `xiqr` -- average IQR-normalised deviation of beta values for
#' significant CpGs from their corresponding median values
#' }
#' @seealso \code{\link{plotAMR}} for plotting AMRs, \code{\link{getUniverse}}
#' for info on enrichment analysis, \code{\link{simulateAMR}} and
#' \code{\link{simulateData}} for the generation of simulated test data sets,
#' and `ramr` vignettes for the description of usage and sample data.
#' @examples
#' data(ramr)
#' getAMR(ramr.data, ramr.samples, ramr.method="beta",
#' min.cpgs=5, merge.window=1000, qval.cutoff=1e-3, cores=2)
#' @importFrom parallel detectCores makeCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom GenomicRanges mcols reduce
#' @importFrom EnvStats ebeta
#' @importFrom ExtDist eBeta pBeta
#' @importFrom matrixStats rowMedians rowIQRs
#' @importFrom foreach foreach
#' @importFrom doRNG %dorng%
#' @importFrom methods as is
#' @importFrom stats median pbeta
#' @export
getAMR <- function (data.ranges,
data.samples=NULL,
ramr.method="IQR",
iqr.cutoff=5,
pval.cutoff=5e-2,
qval.cutoff=NULL,
merge.window=300,
min.cpgs=7,
min.width=1,
exclude.range=NULL,
cores=max(1,parallel::detectCores()-1),
verbose=TRUE,
...)
{
if (!methods::is(data.ranges,"GRanges"))
stop("'data.ranges' must be a GRanges object")
if (is.null(data.samples))
data.samples <- colnames(GenomicRanges::mcols(data.ranges))
if (!all(data.samples %in% colnames(GenomicRanges::mcols(data.ranges))))
stop("'data.ranges' metadata must include 'data.samples'")
if (length(data.samples)<3)
stop("at least three 'data.samples' must be provided")
#####################################################################################
getPValues.beta <- function (data.chunk, ...) {
chunk.filt <- apply(data.chunk, 1, function (x) {
x.median <- stats::median(x, na.rm=TRUE)
x[is.na(x)] <- x.median
beta.fit <- suppressWarnings( EnvStats::ebeta(as.numeric(x), ...) )
pvals <- stats::pbeta(x, beta.fit$parameters[1], beta.fit$parameters[2])
pvals[x>x.median] <- 1 - pvals[x>x.median]
return(pvals)
})
return(t(chunk.filt))
}
getPValues.wbeta<- function (data.chunk, ...) {
chunk.filt <- apply(data.chunk, 1, function (x) {
x.median <- stats::median(x, na.rm=TRUE)
x[is.na(x)] <- x.median
# weight directly correlates with bin contents (number of values per bin)
# and inversly - with the distance from the median value, thus narrowing
# the estimated distribution and emphasizing outliers
c <- cut(x, c(0:100)/100)
b <- table(c)
w <- as.numeric(b[c]) * (1 - abs(x-x.median))
beta.fit <- suppressWarnings( ExtDist::eBeta(as.numeric(x), w, ...) )
pvals <- ExtDist::pBeta(x, params=beta.fit)
pvals[x>x.median] <- 1 - pvals[x>x.median]
return(pvals)
})
return(t(chunk.filt))
}
#####################################################################################
if (verbose) message("Identifying AMRs", appendLF=FALSE)
tm <- proc.time()
doParallel::registerDoParallel(cores)
cl <- parallel::makeCluster(cores)
universe <- getUniverse(data.ranges, merge.window=merge.window, min.cpgs=min.cpgs, min.width=min.width)
universe.cpgs <- unlist(universe$revmap)
betas <- as.matrix(mcols(data.ranges)[universe.cpgs,data.samples,drop=FALSE])
if (is.null(qval.cutoff))
qval.cutoff <- pval.cutoff/ncol(betas)
chunks <- split(seq_len(nrow(betas)), if (cores>1) cut(seq_len(nrow(betas)),cores) else 1)
medians <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowMedians(betas[chunk,], na.rm=TRUE)
if (ramr.method=="IQR") {
iqrs <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowIQRs(betas[chunk,], na.rm=TRUE)
betas.filtered <- (betas-medians)/iqrs
betas.filtered[abs(betas.filtered)<iqr.cutoff] <- NA
} else if (ramr.method=="beta") {
# multi-threaded EnvStats::ebeta (speed: mme=mmue>mle>>>fitdistrplus::fitdist)
betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.beta(betas[chunk,], ...)
betas.filtered <- do.call(rbind, betas.filtered)
betas.filtered[betas.filtered>=qval.cutoff] <- NA
} else if (ramr.method=="wbeta") {
betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.wbeta(betas[chunk,], ...)
betas.filtered <- do.call(rbind, betas.filtered)
betas.filtered[betas.filtered>=qval.cutoff] <- NA
} else {
stop("unknown 'ramr.method'")
}
if (!is.null(exclude.range))
betas.filtered[medians>=exclude.range[1] & medians<=exclude.range[2]] <- NA
getMergedRanges <- function (column) {
not.na <- which(!is.na(betas.filtered[,column]))
ranges <- GenomicRanges::reduce(data.ranges[universe.cpgs[not.na]], min.gapwidth=merge.window, with.revmap=TRUE)
if (length(ranges)>0) {
ranges$ncpg <- unlist(lapply(ranges$revmap, length))
ranges$sample <- column
ranges <- subset(ranges, `ncpg`>=min.cpgs & `width`>=max(1,min.width))
ranges$dbeta <- vapply(ranges$revmap, function (revmap) {
mean(betas[not.na[revmap],column,drop=FALSE] - medians[not.na[revmap]])
}, numeric(1))
if (ramr.method=="IQR") {
ranges$xiqr <- vapply(ranges$revmap, function (revmap) {
mean(betas.filtered[not.na[revmap],column,drop=FALSE], na.rm=TRUE)
}, numeric(1))
}
if (ramr.method=="beta" | ramr.method=="wbeta") {
ranges$pval <- vapply(ranges$revmap, function (revmap) {
return( 10**mean(log10(betas.filtered[not.na[revmap],column] + .Machine$double.xmin), na.rm=TRUE) )
}, numeric(1))
}
ranges$revmap <- lapply(ranges$revmap, function (i) {universe.cpgs[not.na[i]]})
}
return(ranges)
}
amr.ranges <- foreach (column=colnames(betas.filtered)) %dorng% getMergedRanges(column)
parallel::stopCluster(cl)
if (verbose) message(sprintf(" [%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(unlist(methods::as(amr.ranges, "GRangesList")))
}