/
ggCor.R
221 lines (213 loc) · 7.83 KB
/
ggCor.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#' Triangular Correlation Plot
#'
#' @description Return a ggplot object to plot a triangular correlation figure
#' between 2 or more variables.
#'
#' @param data A data.frame with numerical columns for each variable to be
#' compared.
#' @param colours A vector of size three with the colors to be used for values
#' -1, 0 and 1.
#' @param blackLabs A numeric vector of size two, with min and max correlation
#' coefficient.
#' @param show_signif Logical scalar. Display significance values ?
#' @param p_breaks Passed to function 'cut'. Either a numeric vector of two or
#' more unique cut points or a single number (greater than or equal to 2) giving
#' the number of intervals into which x is to be cut.
#' @param p_labels Passed to function 'cut'. labels for the levels of the
#' resulting category. By default, labels are constructed using "(a,b]" interval
#' notation. If \code{p_labels = FALSE}, simple integer codes are returned
#' instead of a factor.
#' @param show_diagonal Logical scalar. Display main diagonal values ?
#' @param diag A named vector of labels to display in the main diagonal. The
#' names are used to place each value in the corresponding coordinates of the
#' diagonal. Hence, these names must be the same as the colnames of data.
#' @param return_table Return the table to display instead of a ggplot object.
#' @param return_n Return plot with shared information.
#' @param adjusted Use the adjusted p values for multiple testing instead of
#' raw coeffs. \code{TRUE} by default.
#' @param label_size Numeric value indicating the label size. 3 by default.
#' @param method method="pearson" is the default value.
#' The alternatives to be passed to cor are "spearman" and "kendall".
#' These last two are much slower, particularly for big data sets.
#'
#' @return A ggplot object containing a triangular correlation figure with all
#' numeric variables in data. If return_table is \code{TRUE}, the table used to
#' produce the figure is returned instead.
#' @export
#'
#' @examples
#' library(agriutilities)
#' data(iris)
#' gg_cor(
#' data = iris,
#' colours = c("#db4437", "white", "#4285f4"),
#' label_size = 6
#' )
#' @author Daniel Ariza, Johan Aparicio.
#' @importFrom stats na.omit
gg_cor <- function(data,
colours = c("#db4437", "white", "#4285f4"),
blackLabs = c(-0.7, 0.7),
show_signif = TRUE,
p_breaks = c(0, .001, .01, .05, Inf),
p_labels = c("***", "**", "*", "ns"),
show_diagonal = FALSE,
diag = NULL,
return_table = FALSE,
return_n = FALSE,
adjusted = TRUE,
label_size = 3,
method = "pearson") {
# Drop non numeric columns in the dataset
if (sum(!sapply(data, is.numeric))) {
message(
"Dropping non-numeric columns in the dataset:\n",
paste(names(which(!sapply(data, is.numeric))), collapse = "\t")
)
data <- data[, sapply(data, is.numeric)]
}
# Calculate corr-coeffs and p values
cors <- psych::corr.test(data, use = "pairwise.complete.obs", method = method)
# Use the adjusted p values for multiple testing instead of raw coeffs
if (adjusted) cors$p <- t(cors$p)
# Keep only the matrices with correlation coefficients, p values and N shared
# samples
cors <- cors[c(1, 2, 4)]
# Make sure you have a full matrix of N shared samples
if (is.vector(cors$n)) {
cors$n <- matrix(
data = cors$n,
ncol = ncol(cors$p),
nrow = nrow(cors$p),
dimnames = dimnames(cors$p)
)
}
# For each matrix, do ...
cors <- lapply(cors, function(x) {
# Keep the upper triangle of the matrix
x[upper.tri(x)] <- NA
# Transpose the matrix to plot the lower triangle
x <- as.data.frame(t(x))
# Reshape the matrix to tidy format
x[, "col"] <- colnames(x)
x <- tidyr::gather(data = x, key = "row", value = "value", -col)
colnames(x) <- c("col", "row", "value")
# Round coefficients
x$name <- round(x$value, 2)
# Sort the x axis according to data column order
x$col <- factor(x$col, levels = colnames(data))
# Reverse the y axis for a triangle plot from top-left to bottom-right
x$row <- factor(x$row, levels = rev(colnames(data)))
# Remove NAs
x <- na.omit(x)
})
# Combine both dataframes with p values and corr coefficients
cors <- merge(
x = merge(x = cors$r, y = cors$p, by = c("col", "row")),
y = cors$n,
by = c("col", "row")
)
# Keep x, y, p val and corr-coefficients columns
cors <- cors[, c(1, 2, 4, 5, 7)]
if (return_n) {
if (return_table) {
return(cors)
}
cors$cols <- scale(cors$value, center = TRUE, scale = TRUE)
cors$cols <- ifelse(abs(cors$cols) < 2, "black", "white")
p <- ggplot2::ggplot(
data = cors,
ggplot2::aes(x = col, y = row, fill = value)
) +
ggplot2::geom_tile(color = "gray") +
ggplot2::labs(x = NULL, y = NULL) +
ggplot2::theme_minimal(base_size = 13) +
ggplot2::geom_text(
ggplot2::aes(x = col, y = row, label = value),
color = cors$cols,
size = label_size
) +
ggplot2::scale_fill_gradient(
low = colours[2],
high = colours[3],
limits = c(0, max(cors$value))
) +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 40, hjust = 1),
legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major = ggplot2::element_blank()
)
return(p)
}
if (show_signif) {
# Create a categorical variable for p values as defined by p_breaks
cors$signi <- cut(
x = cors$value.y, right = FALSE,
breaks = p_breaks, labels = p_labels
)
# Join corr-coeff and p-value to display it as a label for each tile
cors$label <- paste(cors$name.x, cors$sign, sep = "\n")
} else {
# The label for each tile is the corr-coeff only
cors$label <- cors$name.x
}
# If there are user-specified values to display in the diagonal
if (!is.null(diag)) {
# Check the names in diag are the same than colnames of data
if (sum(!names(diag) %in% colnames(data))) {
warning(
"These elements in 'diag' do not correspond to column names in
'data':\n",
paste(names(diag)[!names(diag) %in% colnames(data)],
collapse = "\t"
)
)
}
# The tiles of the diagonal are gray
cors[cors$col == cors$row, "name.x"] <- NA
# Get the name of x and y levels
d <- as.character(cors[cors$col == cors$row, "row"])
# Modify the elements of the diagonal and make sure they are displayed
cors[cors$col == cors$row, "label"] <- diag[d]
show_diagonal <- TRUE
}
# Remove the elements of the main diagonal if you don't want to display
if (!show_diagonal) cors <- cors[cors$col != cors$row, ]
# Show darker tiles with white labels for clarity
cors$txtCol <- ifelse(
test = cors$name.x > blackLabs[1] & cors$name.x < blackLabs[2],
yes = "black",
no = "white"
)
# Do not show tile labels for empty tiles.
# Make tile labels of the diagonal white
cors$txtCol[is.na(cors$txtCol)] <- "white"
if (return_table) {
return(cors)
}
p <- ggplot2::ggplot(
data = cors,
ggplot2::aes(x = col, y = row, fill = name.x)
) +
ggplot2::geom_tile(color = "gray") +
ggplot2::labs(x = NULL, y = NULL) +
ggplot2::theme_minimal(base_size = 13) +
ggplot2::geom_text(
ggplot2::aes(x = col, y = row, label = label),
color = cors$txtCol,
size = label_size
) +
ggplot2::scale_fill_gradient2(
low = colours[1],
mid = colours[2],
high = colours[3]
) +
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 40, hjust = 1),
legend.position = "none",
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.major = ggplot2::element_blank()
)
return(p)
}