/
coca.R
176 lines (159 loc) · 7.5 KB
/
coca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#' Cluster-Of-Clusters Analysis
#'
#' This function performs Cluster-Of-Clusters Analysis on a binary matrix in
#' which each row corresponds to a data point, each column corresponds to a
#' cluster, and the element in position (i,j) is equal to 1 if data point i
#' belongs to cluster j, 0 otherwise.
#'
#' @param moc N X C data matrix, where C is the total number of clusters
#' considered.
#' @param K Number of clusters.
#' @param maxK Maximum number of clusters considered for the final clustering if
#' K is not known. Default is 6.
#' @param B Number of iterations of the Consensus Clustering step.
#' @param pItem Proportion of items sampled at each iteration of the Consensus
#' Cluster step.
#' @param hclustMethod Agglomeration method to be used by the hclust function to
#' perform hierarchical clustering on the consensus matrix. Can be "single",
#' "complete", "average", etc. For more details please see ?stats::hclust.
#' @param choiceKmethod Method used to choose the number of clusters if K is
#' NULL, can be either "AUC" (area under the curve, work in progress) or
#' "silhouette". Default is "silhouette".
#' @param ccClMethod Clustering method to be used by the Consensus Clustering
#' algorithm (CC). Can be either "kmeans" for k-means clustering or "hclust" for
#' hierarchical clustering. Default is "kmeans".
#' @param ccDistHC Distance to be used by the hierarchical clustering algorithm
#' inside CC. Can be "pearson" (for 1 - Pearson correlation), "spearman" (for 1-
#' Spearman correlation), or any of the distances provided in stats::dist()
#' (i.e. "euclidean", "maximum", "manhattan", "canberra", "binary" or
#' "minkowski"). Default is "euclidean".
#' @param maxIterKM Number of iterations for the k-means clustering algorithm.
#' Default is 1000.
#' @param verbose Boolean. If TRUE, print the number of clusters K used for
#' the final clustering. Default is FALSE.
#' @param savePNG Boolean. Save plots as PNG files. Default is FALSE.
#' @param fileName If \code{savePNG} is TRUE, this is the string containing (the
#' first part of) the name of the output files. Can be used to specify the
#' folder path too. Default is "coca". The ".png" extension is automatically
#' added to this string.
#' @param widestGap Boolean. If TRUE, compute also widest gap index to choose
#' best number of clusters. Default is FALSE.
#' @param dunns Boolean. If TRUE, compute also Dunn's index to choose best
#' number of clusters. Default is FALSE.
#' @param dunn2s Boolean. If TRUE, compute also alternative Dunn's index to
#' choose best number of clusters. Default is FALSE.
#' @param returnAllMatrices Boolean. If TRUE, return consensus matrices for all
#' considered values of K. Default is FALSE.
#' @return This function returns a list containing:
#' \item{consensusMatrix}{a symmetric matrix where the element in position
#' (i,j) corresponds to the proportion of times that items i and j have been
#' clustered together.}
#' \item{clusterLabels}{the final cluster labels.}
#' \item{K}{the final number of clusters. If provided by the user, this is
#' the same as the input. Otherwise, this is the number of clusters selected via
#' the requested method (see argument \code{choiceKmethod}).}
#' \item{consensusMatrices}{if returnAllMatrices = TRUE and K is not provided
#' by the user, this array is also returned, containing the consensus matrices
#' obtained for each of the numbers of clusters considered by the algorithm.}
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @references The Cancer Genome Atlas, 2012. Comprehensive molecular
#' portraits of human breast tumours. Nature, 487(7407), pp.61–70.
#' @references Cabassi, A. and Kirk, P. D. W. (2019). Multiple kernel learning
#' for integrative consensus clustering of 'omic datasets. arXiv preprint.
#' arXiv:1904.07701.
#' @examples
#' # Load data
#' data <- list()
#' data[[1]] <- as.matrix(read.csv(system.file("extdata", "dataset1.csv",
#' package = "coca"), row.names = 1))
#' data[[2]] <- as.matrix(read.csv(system.file("extdata", "dataset2.csv",
#' package = "coca"), row.names = 1))
#' data[[3]] <- as.matrix(read.csv(system.file("extdata", "dataset3.csv",
#' package = "coca"), row.names = 1))
#'
#' # Build matrix of clusters
#' outputBuildMOC <- buildMOC(data, M = 3, K = 5, distances = "cor")
#'
#' # Extract matrix of clusters
#' moc <- outputBuildMOC$moc
#'
#' # Do Cluster-Of-Clusters Analysis
#' outputCOCA <- coca(moc, K = 5)
#'
#' # Extract cluster labels
#' clusterLabels <- outputCOCA$clusterLabels
#'
#' @export
coca <-
  function(moc,
           K = NULL,
           maxK = 6,
           B = 1000,
           pItem = 0.8,
           hclustMethod = "average",
           choiceKmethod = "silhouette",
           ccClMethod = "kmeans",
           ccDistHC = "euclidean",
           maxIterKM = 1000,
           savePNG = FALSE,
           fileName = "coca",
           verbose = FALSE,
           widestGap = FALSE,
           dunns = FALSE,
           dunn2s = FALSE,
           returnAllMatrices = FALSE) {
    # Overview:
    #   * If K is NULL, a consensus matrix is computed for every candidate
    #     number of clusters 2, ..., maxK and K is chosen with the method named
    #     by `choiceKmethod` ("silhouette" or "AUC").
    #   * The consensus matrix for the final K is then turned into a
    #     dissimilarity (1 - consensus), clustered with stats::hclust and cut
    #     into K groups.

    # Initialise output list
    output <- list()
    N <- dim(moc)[1]

    # Single entry point for the Consensus Clustering step, so that every code
    # path forwards the same settings. Previously maxIterKM was only passed on
    # in the AUC branch and silently ignored elsewhere.
    runConsensusCluster <- function(nClusters) {
      consensusCluster(moc, nClusters, B, pItem, clMethod = ccClMethod,
                       dist = ccDistHC, maxIterKM = maxIterKM)
    }

    # `&&` is used because these are scalar conditions; it also avoids
    # evaluating choiceKmethod at all when K has been supplied.
    if (is.null(K) && choiceKmethod == "silhouette") {
      # Slice/row (i - 1) stores the results for i clusters, i = 2, ..., maxK
      consensusMatrix <- array(NA, c(N, N, maxK - 1))
      clLabels <- array(NA, c(maxK - 1, N))
      for (i in seq_len(maxK - 1) + 1) {
        ### Step 1. Compute the consensus matrix ###
        consensusMatrix[, , i - 1] <- runConsensusCluster(i)
        ### Step 2. Use hierarchical clustering on the consensus matrix ###
        distances <- stats::as.dist(1 - consensusMatrix[, , i - 1])
        hClustering <- stats::hclust(distances, method = hclustMethod)
        clLabels[i - 1, ] <- stats::cutree(hClustering, i)
      }
      K <- maximiseSilhouette(consensusMatrix, clLabels, maxK, savePNG,
                              fileName, widestGap = widestGap, dunns = dunns,
                              dunn2s = dunn2s)$K
    } else if (is.null(K) && choiceKmethod == "AUC") {
      consensusMatrix <- array(NA, c(N, N, maxK - 1))
      areaUnderTheCurve <- rep(NA, maxK - 1)
      for (i in seq_len(maxK - 1) + 1) {
        ### Step 1. Compute the consensus matrix ###
        consensusMatrix[, , i - 1] <- runConsensusCluster(i)
        ### Step 2. Compute area under the curve ###
        areaUnderTheCurve[i - 1] <- computeAUC(consensusMatrix[, , i - 1])
      }
      K <- chooseKusingAUC(areaUnderTheCurve, savePNG, fileName)$K
    } else if (is.null(K)) {
      stop(paste0(
        "Method to choose number of clusters has not been recognised.\n",
        "Please make sure that it is either `silhouette` or `AUC`."
      ))
    } else {
      # K supplied by the user: no per-K consensus matrices are computed
      consensusMatrix <- NULL
    }

    if (verbose) {
      print(paste("K =", K, sep = " "))
    }

    ### Step 1. Compute the consensus matrix ###
    if (!is.null(consensusMatrix)) {
      # Reuse the matrix already computed while selecting K
      output$consensusMatrix <- consensusMatrix[, , K - 1]
    } else {
      output$consensusMatrix <- runConsensusCluster(K)
    }

    ### Step 2. Use hierarchical clustering on the consensus matrix ###
    distances <- stats::as.dist(1 - output$consensusMatrix)
    hClustering <- stats::hclust(distances, method = hclustMethod)
    output$clusterLabels <- stats::cutree(hClustering, K)
    output$K <- K

    # When K was supplied, consensusMatrix is NULL and assigning NULL to a list
    # element adds nothing, so `consensusMatrices` is absent from the output.
    if (returnAllMatrices) {
      output$consensusMatrices <- consensusMatrix
    }

    return(output)
  }