-
Notifications
You must be signed in to change notification settings - Fork 1
/
extractPatterns.R
188 lines (184 loc) · 9.19 KB
/
extractPatterns.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#' extractPatterns
#'
#' @description
#' This function extracts methylation patterns (epialleles) for a given genomic
#' region of interest.
#'
#' @details
#' The function matches reads (for paired-end sequencing alignment files - read
#' pairs as a single entity) to the genomic
#' region provided in a BED file/\code{\linkS4class{GRanges}} object, extracts
#' methylation statuses of bases within those reads, and returns a data frame
#' which can be used for plotting of DNA methylation patterns.
#'
#' @param bam BAM file location string OR preprocessed output of
#' \code{\link[epialleleR]{preprocessBam}} function. Read more about BAM file
#' requirements and BAM preprocessing at \code{\link{preprocessBam}}.
#' @param bed Browser Extensible Data (BED) file location string OR object of
#' class \code{\linkS4class{GRanges}} holding genomic coordinates for
#' regions of interest. It is used to match sequencing reads to the genomic
#' regions prior to eCDF computation. The style of seqlevels of BED file/object
#' must match the style of seqlevels of the BAM file/object used. The
#' BED/\code{\link[GenomicRanges]{GRanges}} rows are \strong{not} sorted
#' internally.
#' @param bed.row single non-negative integer specifying what `bed` region
#' should be included in the output (default: 1).
#' @param zero.based.bed boolean defining if BED coordinates are zero based
#' (default: FALSE).
#' @param match.min.overlap integer for the smallest overlap between read's and
#' BED/\code{\link[GenomicRanges]{GRanges}} start or end positions during
#' matching of capture-based NGS reads (default: 1).
#' @param extract.context string defining cytosine methylation context used
#' to report:
#' \itemize{
#' \item "CG" (the default) -- CpG cytosines (called as zZ)
#' \item "CHG" -- CHG cytosines (xX)
#' \item "CHH" -- CHH cytosines (hH)
#' \item "CxG" -- CG and CHG cytosines (zZxX)
#' \item "CX" -- all cytosines
#' }
#' @param min.context.freq real number in the range [0;1] (default: 0.01).
#' Genomic positions that are covered by smaller fraction of patterns (e.g.,
#' with erroneous context) won't be included in the report.
#' @param clip.patterns boolean if patterns should not extend over the edge of
#' `bed` region (default: FALSE).
#' @param strand.offset single non-negative integer specifying the offset of
#' bases at the reverse (-) strand compared to the forward (+) strand. Allows
#' to "merge" genomic positions when methylation is symmetric (in CG and CHG
#' contexts). By default, equals 1 for `extract.context`=="CG", 2 for "CHG", or
#' 0 otherwise.
#' @param highlight.positions integer vector with genomic positions of bases
#' to include in every overlapping pattern. Allows to visualize the
#' distribution of single-nucleotide variations (SNVs) among methylation
#' patterns. `highlight.positions` takes precedence if any of these positions
#' overlap with within-the-context positions of methylation pattern.
#' @param ... other parameters to pass to the
#' \code{\link[epialleleR]{preprocessBam}} function.
#' Options have no effect if preprocessed BAM data was supplied as an input.
#' @param verbose boolean to report progress and timings (default: TRUE).
#' @return \code{\link[data.table]{data.table}} object containing
#' per-read (pair) base methylation information for the genomic region of
#' interest. The report columns are:
#' \itemize{
#' \item seqnames -- read (pair) reference sequence name
#' \item strand -- read (pair) strand
#' \item start -- start of the read (pair)
#' \item end -- end of the read (pair)
#' \item nbase -- number of within-the-context bases for this read (pair)
#' \item beta -- beta value of this read (pair), calculated as a ratio of the
#' number of methylated within-the-context bases to the total number of
#' within-the-context bases
#' \item pattern -- hex representation of 64-bit FNV-1a hash calculated for
#' all reported base positions and bases in this read (pair). This
#' hash value depends only on included genomic positions and their methylation
#' call string chars (hHxXzZ) or nucleotides (ACGT, for highlighted bases
#' only), thus it is expected to be unique for every
#' methylation pattern, although equal for identical methylation patterns
#' independently on read (pair) start, end, or strand (when correct
#' `strand.offset` is given)
#' \item ... -- columns for each genomic position that hold corresponding
#' methylation call string char, or NA if position is not present in the read
#' (pair)
#' }
#' @seealso \code{\link{preprocessBam}} for preloading BAM data,
#' \code{\link{generateCytosineReport}} for methylation statistics at the level
#' of individual cytosines, \code{\link{generateBedReport}} for genomic
#' region-based statistics, \code{\link{generateVcfReport}} for evaluating
#' epiallele-SNV associations, \code{\link{generateBedEcdf}} for analysing the
#' distribution of per-read beta values, and `epialleleR` vignettes for the
#' description of usage and sample data.
#' @examples
#' # amplicon data
#' amplicon.bam <- system.file("extdata", "amplicon010meth.bam",
#' package="epialleleR")
#' amplicon.bed <- system.file("extdata", "amplicon.bed",
#' package="epialleleR")
#'
#' # let's get our patterns
#' patterns <- extractPatterns(bam=amplicon.bam, bed=amplicon.bed, bed.row=3)
#' nrow(patterns) # read pairs overlap genomic region of interest
#'
#' # these are positions of bases
#' base.positions <- grep("^[0-9]+$", colnames(patterns), value=TRUE)
#'
#' # let's make a summary table with counts of every pattern
#' patterns.summary <- patterns[, c(lapply(.SD, unique), .N),
#' by=.(pattern, beta), .SDcols=base.positions]
#' nrow(patterns.summary) # unique methylation patterns
#'
#' # let's melt and plot them
#' plot.data <- data.table::melt.data.table(patterns.summary,
#' measure.vars=base.positions, variable.name="pos", value.name="base")
#'
#' # continuous positions, nonunique patterns according to their counts
#' if (require("ggplot2", quietly=TRUE) & require("ggstance", quietly=TRUE)) {
#' ggplot(na.omit(plot.data)[N>1],
#' aes(x=as.numeric(as.character(pos)), y=factor(N),
#' group=pattern, color=factor(base, levels=c("z","Z")))) +
#' geom_line(color="grey", position=position_dodgev(height=0.5)) +
#' geom_point(position=position_dodgev(height=0.5)) +
#' scale_colour_grey(start=0.8, end=0) +
#' theme_light() +
#' labs(x="position", y="count", title="epialleles", color="base")
#' }
#'
#' # upset-like plot of all patterns, categorical positions, sorted by counts
#' if (require("ggplot2", quietly=TRUE) & require("gridExtra", quietly=TRUE)){
#' grid.arrange(
#' ggplot(na.omit(plot.data),
#' aes(x=pos, y=reorder(pattern,N),
#' color=factor(base, levels=c("z","Z")))) +
#' geom_line(color="grey") +
#' geom_point() +
#' scale_colour_grey(start=0.8, end=0) +
#' theme_light() +
#' scale_x_discrete(breaks=function(x){x[c(rep(FALSE,5), TRUE)]}) +
#' theme(axis.text.y=element_blank(), legend.position="none") +
#' labs(x="position", y=NULL, title="epialleles", color="base"),
#'
#' ggplot(unique(na.omit(plot.data)[, .(pattern, N, beta)]),
#' aes(x=N+0.5, y=reorder(pattern,N), alpha=beta, label=N)) +
#' geom_col() +
#' geom_text(alpha=0.5, nudge_x=0.2, size=3) +
#' scale_x_log10() +
#' theme_minimal() +
#' theme(axis.text.y=element_blank(), legend.position="none") +
#' labs(x="count", y=NULL, title=""),
#' ncol=2, widths=c(0.75, 0.25)
#' )
#' }
#'
#' @export
extractPatterns <- function (bam,
bed,
bed.row=1,
zero.based.bed=FALSE,
match.min.overlap=1,
extract.context=c("CG", "CHG", "CHH", "CxG", "CX"),
min.context.freq=0.01,
clip.patterns=FALSE,
strand.offset=c("CG"=1, "CHG"=2, "CHH"=0,
"CxG"=0, "CX"=0)[extract.context],
highlight.positions=c(),
...,
verbose=TRUE)
{
bed.row <- as.integer(bed.row[1])
extract.context <- match.arg(extract.context, extract.context)
strand.offset <- as.integer(strand.offset[1])
highlight.positions <- as.integer(highlight.positions)
if (!methods::is(bed, "GRanges"))
bed <- .readBed(bed.file=bed, zero.based.bed=zero.based.bed,
verbose=verbose)
bam <- preprocessBam(bam.file=bam, ..., verbose=verbose)
patterns <- .getPatterns(
bam.processed=bam, bed=bed, bed.row=bed.row,
match.min.overlap=match.min.overlap,
extract.context=paste0(.context.to.bases[[extract.context]]
[c("ctx.meth","ctx.unmeth")], collapse=""),
min.context.freq=min.context.freq, clip.patterns=clip.patterns,
strand.offset=strand.offset, highlight.positions=highlight.positions,
verbose=verbose
)
return(patterns)
}