/
qthist.R
169 lines (158 loc) · 5.75 KB
/
qthist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#' Calculate the widths of regions
#'
#' The length of a genomic region (the distance between the start and end)
#' is called the width
#' When given a query set of genomic regions, this function returns the width
#' @param query A GRanges or GRangesList object with query sets
#' @return A vector of the widths (end-start coordinates) of GRanges objects.
#' @export
#' @examples
#' regWidths = calcWidth(vistaEnhancers)
calcWidth = function(query) {
if (is(query, "GRangesList")) {
# Recurse over each GRanges object
x = lapply(query, calcWidth)
return(x) }
width(query)
}
#' Plot quantile-trimmed histogram
#'
#' Given the results from \code{calcWidth}, plots a histogram with
#' outliers trimmed.
#'
#' x-axis breaks for the frequency calculations are based on the "divisions"
#' results from helper function \code{calcDivisions}.
#'
#' @param x Data values to plot - vector or list of vectors
#' @param EndBarColor Color for the quantile bars on both ends of the graph
#' (optional)
#' @param MiddleBarColor Color for the bars in the middle of the graph
#' (optional)
#' @param quantThresh Quantile of data to be contained in each end bar (optional)
#' quantThresh values must be under .2, optimal size is under .1
#' @param bins The number of bins for the histogram to allocate data to.
#' (optional)
#' @param indep logical value which returns a list of plots that have had their
#' bins calculated independently; the normal version will plot them on the
#' same x and y axis.
#' @param numbers a logical indicating whether the raw numbers should be
#' displayed, rather than percentages (optional).
#' @return A ggplot2 plot object
#' @export
#' @examples
#' regWidths = calcWidth(vistaEnhancers)
#' qtHist = plotQTHist(regWidths)
#' qtHist2 = plotQTHist(regWidths, quantThresh=0.1)
plotQTHist = function(x, EndBarColor = "gray57", MiddleBarColor = "gray27",
quantThresh=NULL, bins=NULL, indep=FALSE, numbers=FALSE) {
if (indep) {
if (is(x, "list") | is(x, "List")) {
x = lapply(x, plotQTHist)
namesx = names(x)
for (i in seq_along(x)){
x[[i]] = x[[i]] + ggtitle(namesx[i])
}
return(x)
# you can use grid.arrange like this to plot these
# do.call("grid.arrange", x)
}
}
output = calcDivisions(x, quantThresh=quantThresh, bins=bins)
# if all x are the same - recalculate divisions
divisionCheck = output[["divisions"]]
if (length(divisionCheck) > length(unique(divisionCheck))){
if (length(unique(divisionCheck)) == 3){
output[["divisions"]] = c(-Inf, divisionCheck[2],
divisionCheck[2]+1, Inf)
output[["bins"]] = 1
} else {
output[["divisions"]] = unique(divisionCheck)
output[["bins"]] = (length(unique(divisionCheck)) - 3)
}
}
if(is(x, "List")){
x = as.list(x)
}
if(is.list(x)){
nplots = length(x)
} else {
nplots = 1
}
df = cutDists(x, divisions=output[["divisions"]])
if ("name" %in% names(df)){
if (!numbers)
df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100),
by = name]$"Freq.Per"
g = ggplot(df, aes(x=cuts, y=Freq, fill=name)) +
facet_wrap(. ~name)
} else {
if (!numbers)
df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100)]$"Freq.Per"
g = ggplot(df, aes(x=cuts, y=Freq))
}
# Create a vector for the colors
colors_vect = c(EndBarColor ,
rep(MiddleBarColor, (length(output[["divisions"]])-3)), EndBarColor)
colors_vect = rep(colors_vect, nplots)
nbars = output[["bins"]]+2
qbaridx = sort(c(seq(1, nbars*nplots, by=nbars),
seq(nbars, nbars*nplots, by=nbars)))
g = g +
geom_bar(stat="identity", fill = colors_vect) +
theme_classic() +
theme(aspect.ratio=1) +
theme_blank_facet_label() +
ylab("Frequency") +
xlab("") +
theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust=0.5)) +
theme(plot.title = element_text(hjust = 0.5)) + # Center title
ggtitle("Quantile Trimmed Histogram") +
theme(legend.position="bottom") +
geom_text(aes(label= paste((output[["quantile"]]*100),"%", sep='')),
data=df[qbaridx,], hjust=-1, angle=90, size=2.5)
if (!numbers){
g = g + ylab("Percentage")
}
return(g)
}
# Internal helper function for \code{plotQTHist}
#
# If the bins or quantiles for the hist are specified by the user, those are
# used. Otherwise, this function is used to calculate 1) number of bins based
# on size of the dataset, and 2) quantiles based on bins.
#
# @param x A vector of GRanges x.
# @return A list of the divisions that will be used in plotting the histogram.
# @examples
# calcDivisions(runif(500)*1000)
calcDivisions = function(x, bins=NULL, quantThresh = NULL){
if(is.list(x)){
x=unlist(x)
}
# calculating bins
if(!is.null(bins)){
b = bins
} else {
n = length(x)
if (n > 10000) {n = 10000}
if (n < 500) {n = 500}
# finding number of bins based on the size of dataset
b = round(n^.15 + (n/200))
}
# calculating quantiles
if(!is.null(quantThresh)){
if(quantThresh > .2){
stop("quantThresh value must be less than .2, Optimal size is under .1") }
q = quantThresh
} else {
# finding the quantile on each side based on number of bins
q = round(25/(b))/100
# minimum on each side is 1%
q = max(.01, q)
}
quant = unname(stats::quantile(x, probs = c((q), (1-(q)))))
seq_10 = seq(quant[1], quant[2], length = b+1)
div = c(-Inf, round(seq_10), Inf)
listOutput <- list("bins"= b,"quantile"= q, "divisions" = div)
return(listOutput)
}