/
generalize-userfunction.R
435 lines (425 loc) · 19 KB
/
generalize-userfunction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
#' Create a GenMLVSBM object from observed data
#'
#' @param X A list of \eqn{L} square matrices with binary or count data,
#' ordered from upper to lower level
#' @param A A list of \eqn{L-1} matrices the affiliation matrix, linking X[[l]]
#' and X[[l+1]] with \code{nrow(A[[l]]) == nrow(X[[l+1]])} and
#' \code{ncol(A[[l]]) == nrow(X[[l]])}. Rows may have any number of affiliations
#' but will be rescaled so that each rows sum either to 1 or 0.
#' @param directed A vector of L booleans are the levels
#' directed or not. Default will check if the matrix are symmetric or not.
#' @param distribution A vector of length L, the distribution of X,
#' only "bernoulli" and "poisson" are implemented.
#'
#' @return An unfitted MLVSBM object corresponding to the multilevel network
#' @importFrom stats rbinom
#' @export
#' @examples
#'
#' # A standard binary 2 levels multilevel network
#' ind_adj <- matrix(stats::rbinom(n = 20**2, size = 1, prob = .2),
#' nrow = 20, ncol = 20)
#' diag(ind_adj) <- 0
#' org_adj <- matrix(stats::rbinom(n = 10**2, size = 1, prob = .3),
#' nrow = 10, ncol = 10)
#' diag(org_adj) <- 0
#' affiliation <- matrix(0, 20, 10)
#' affiliation[cbind(seq(20), sample(seq(10), 20, replace = TRUE))] <- 1
#'
#' my_mlvsbm <- mlvsbm_create_generalized_network(X = list(org_adj, ind_adj),
#' directed = c(TRUE, TRUE),
#' A = list(affiliation),
#' distribution = rep("bernoulli", 2))
#'
#' # A 4 levels temporal network with the same nodes and with count data.
#' #All the affiliation matrices are diagonal.
#'
#' X <- replicate(4,
#' matrix(stats::rpois(n = 20*20, lambda = .7), 20, 20),
#' simplify = FALSE)
#' for (m in seq(4)) {diag(X[[m]]) <- 0}
#' A <- replicate(3, diag(1, 20), simplify = FALSE)
#' my_mlvsbm <- mlvsbm_create_generalized_network(X = X,
#' directed = rep(4, TRUE),
#' A = A,
#' distribution = rep("poisson", 4))
mlvsbm_create_generalized_network <-
function(X, A, directed = NULL,
distribution = NULL) {
for (l in seq_along(A)) {
if ( any(! rowSums(A[[l]]) %in% c(0,1))) {
# warning(paste0("All rows of A must have exactly one 1!!!"))
message(paste0("Affiliation has been normalized so that any rows sum to zero or one!"))
A[[l]][rowSums(A[[l]]) != 0,] <- A[[l]][rowSums(A[[l]]) != 0,]/
rowSums(A[[l]][rowSums(A[[l]]) != 0,])
}
if ( any(rowSums(A[[l]]) == 0)) {
message(paste0("Some rows of A have no non 0 entry!!!"))
}
}
for (l in seq_along(X)) {
if (! is.matrix(X[[l]]) |
ncol(X[[l]]) != nrow(X[[l]])) {
stop(paste0("X[[l]] must be a square matrix!!!"))
}
}
for (l in seq_along(A)) {
if (ncol(X[[l+1]]) != nrow(A[[l]]) |
ncol(X[[l]]) != ncol(A[[l]])) {
stop(paste0("A[[l]], X[[l]] and X[[l+1]]'s dimensions are not compatible!!!"))
}
}
if (is.null(directed)) {
directed <- vapply(
X, function(x) ! isSymmetric.matrix(x), FUN.VALUE = TRUE
)
}
new_genmlvsbm <-
GenMLVSBM$new(
n = vapply(X, ncol, FUN.VALUE = 1),
X = X,
A = A,
L = length(X),
directed = directed,
distribution = distribution
)
new_genmlvsbm$min_clusters <- rep(1, length(X))
new_genmlvsbm$max_clusters <- vapply(X, function(x) floor(sqrt(nrow(x))),
FUN.VALUE = 1)
return (new_genmlvsbm)
}
#' Create a simulate a generalized multilevel network (GenMLVSBM object)
#'
#' @param n A vector of L positive integers where L is the number of levels from
#' the upper level to the lower level.
#' @param Q A vector of L positive integers,
#' the number of clusters for each level.
#' @param pi A list of length L, with vectors of probabilities of length Q[l],
#' the mixture parameters. pi[[1]] must be a probability, pi[[l]] can be set to
#' \code{NULL} for a given level if all nodes of this level have an affiliation.
#' @param alpha A list of L matrices, of size \eqn{Q[l] \times Q[l]} matrix
#' giving the connectivity probabilities.
#' @param directed A vector of L logical. Is level l a directed
#' network ?
#' @param gamma A list of size \eqn{L-1} of \eqn{Q[l+1] \times Q[l]} matrix with
#' each column summing to one, the mixture parameters given the affiliation
#' @param affiliation The distribution under which the affiliation matrix is
#' simulated in c("uniform", "preferential", "diagonal").
#' "diagonal" is a special case where all affiliation matrix are diagonal
#' (individuals are the same on each levels, for temporal networks e.g.).
#' It requires \code{n} to be
#' constant.
#' @param no_empty_org A logical with FALSE as default, should
#' every nodes have at least one affiliated node on the level below?
#' Needs to have \eqn{n[l] \geq n[l+1]}.
#' @param distribution A list for the distribution of X,
#' only "bernoulli" is implemented.
#' @param no_isolated_node A logical, if TRUE then the network is simulated
#' again until all nodes are connected.
#'
#' @return An GenMLVSBM object, a simulated multilevel network with levels,
#' affiliations and memberships.
#' @export
#'
#' @examples
#' my_genmlvsbm <- MLVSBM::mlvsbm_simulate_generalized_network(
#' n = c(10, 20), # Number of nodes for each level
#' Q = c(2, 2), # Number of blocks for each level
#' pi = list(c(.3, .7), NULL), # Block proportion for the upper level, must sum to one
#' gamma = list(matrix(c(.9, .2, # Block proportion for the lower level,
#' .1, .8), # each column must sum to one
#' nrow = 2, ncol = 2, byrow = TRUE)),
#' alpha = list(matrix(c(.8, .2,
#' .2, .1),
#' nrow = 2, ncol = 2, byrow = TRUE), # Connection matrix
#' matrix(c(.99, .3,
#' .3, .1),
#' nrow = 2, ncol = 2, byrow = TRUE)),# between blocks
#' directed = c(FALSE, FALSE),
#' distribution = rep("bernoulli", 2)) # Are the upper and lower level directed
mlvsbm_simulate_generalized_network <-
function (n, Q, pi, gamma, alpha,
directed, affiliation = "uniform",
distribution,
no_empty_org = FALSE,
no_isolated_node = FALSE) {
# browser()
if (any(n <= 0)) {
stop(paste0("n must be positive integers!!!"))
}
if(any(! vapply(alpha, is.matrix, TRUE)) |
any(! vapply(gamma, is.matrix, TRUE))) {
stop(paste0("element of gamma and alpha must be matrices!!!"))
}
if (any(pi[[1]] > 1) | any(pi[[1]] < 0) | sum(pi[[1]]) != 1) {
warning(paste0("pi[[1]] is a probability vector,
its coefficients must sum to one!!!"))
}
L <- length(n)
if ( Q[1] != length(pi[[1]]) |
any(vapply(gamma, ncol, 1) != Q[1:(L-1)]) |
any(vapply(gamma, nrow, 1) != Q[2:L]) |
any(vapply(alpha, ncol, 1) != Q) |
any(vapply(alpha, nrow, 1) != Q)) {
stop(paste0("Number of clusters and parameters dimension are
not compatible!!!"))
}
for (l in seq(L-1)) {
if (any(gamma[[l]] > 1) | any(gamma[[l]] < 0) |
any(colSums(gamma[[l]]) != 1)) {
stop(paste0("Any column of gamma must be a probability vector!!!"))
}
}
lapply(seq(L),
function(l) {
if (distribution[l] == "bernoulli") {
if (any(alpha[[l]] > 1) | any(alpha[[l]] < 0)) {
stop(paste0("Any coefficient of alpha[[l]] must be
between 0 and 1!!!"))
}
}
})
# if (! directed[[1]] & !isSymmetric(alpha[[1]])) {
# stop(paste0("alpha[[1]] is not symmetric but level is undirected!!!"))
# }
# if (! directed[[2]] & ! isSymmetric(alpha[[2]])) {
# stop(paste0("alpha[[2]] is not symmetric but level is undirected!!!"))
# }
new_mlvsbm <-
GenMLVSBM$new(n = n,
L = L,
directed = directed,
sim_param = list(alpha = alpha,
gamma = gamma,
pi = pi,
Q = Q,
affiliation = affiliation,
no_empty_org = no_empty_org,
no_isolated_node = no_isolated_node),
distribution = rep("bernoulli", L))
new_mlvsbm$simulate()
return(new_mlvsbm)
}
#' Infer a generalized multilevel network (GenMLVSBM object),
#' the original object is modified
#'
#' @description The inference use a greedy algorithm to navigate between model
#' size. For a given model size, the inference is done via a variational EM
#' algorithm. The returned model is the one with the highest ICL criterion among
#' all visited models.
#'
#' By default the algorithm fits a single level SBM for each level, before
#' inferring the generalized multilevel network. This step can be skipped by
#' specifying an initial clustering with the \code{init_clustering}. Also,
#' a given model size can be force by setting the parameters \code{nb_clusters}
#' to a given value.
#'
#' @param gmlv A GenMLVSBM object, the network to be inferred.
#' @param nb_clusters A vector of L integer, the model sizes.
#' If left to \code{NULL}, the algorithm
#' will navigate freely. Otherwise it will navigate between the specified model
#' size and its neighbors.
#' @param init_clustering A list of L vectors of integers of the same length as
#' the number of node of each level. If specified, the algorithm will start from
#' this clustering, then navigate freely.
#' @param nb_cores An integer, the number of cores to use. Default to \code{1}
#' for Windows and \code{detectCores()/2} for Linux and MacOS
#' @param init_method One of "hierarchical" (the default) or "spectral",
#' "spectral" might be more efficient but can lead to some numeric errors.
#' Not used when int_clustering is given.
#' @param fit_options A named list to be passed to the VE-M inference algorithm.
#'
#' @return A FitGenMLVSBM object, the best inference of the network
#'
#' @details
#' ## fit_options
#' ### ve: Using the default \code{ve = "joint"} will update all the block
#' memberships of all levels at each VE step before performing a M step.
#' Using \code{ve = "sequential"} will update the block memberships of one level
#' at a time before performing a M step only on the concerned parameters. Use
#' this option if the running time of the algorithm is too long and the number
#' of levels is large.
#' ### init_points: Only used when giving no initial clustering and no number of clusters.
#' If "all" will use an initial clustering obtained from doing a spectral
#' clustering with fit_options$Qmax clusters for each level (default to
#' \code{Q = Qmax = celing(log(n))}) in addition to the initial clustering
#' obtained from fitting independent sbm on each level.
#' @importFrom blockmodels BM_bernoulli
#' @export
#' @examples
#' my_genmlvsbm <- MLVSBM::mlvsbm_simulate_generalized_network(
#' n = c(20,20), # Number of nodes for the lower level and the upper level
#' Q = c(2,2), # Number of blocks for the lower level and the upper level
#' pi = list(c(.3, .7),NULL), # Block proportion for the upper level, must sum to one
#' gamma = list(matrix(c(.9, .2, # Block proportion for the lower level,
#' .1, .8), # each column must sum to one
#' nrow = 2, ncol = 2, byrow = TRUE)),
#' alpha = list(matrix(c(.8, .2,
#' .2, .1),
#' nrow = 2, ncol = 2, byrow = TRUE), # Connection matrix
#' matrix(c(.99, .3,
#' .3, .1),
#' nrow = 2, ncol = 2, byrow = TRUE)),# between blocks
#' directed = c(FALSE, FALSE), # Are the upper and lower level directed or not ?
#' affiliation = "preferential",
#' distribution = rep("bernoulli", 2)) # How the affiliation matrix is generated
#' \donttest{fit <- MLVSBM::mlvsbm_estimate_generalized_network(
#' gmlv = my_genmlvsbm, nb_cores = 1)}
#'
#' # A more complex example. A 4 levels network with different structures and
#' # level direction.
#' n <- 100
#' L <- 4
#' alpha <- list(
#' diag(.4, 3, 3) + .1, # Undirected assortative community
#' -diag(.2, 3, 3) + .3, # Undirected disassortative
#' matrix(c(.8, .2, .1, # Undirected core-periphery
#' .4, .4, .1,
#' .2, .1, .1), 3, 3),
#' matrix(c(.3, .5, .5, # Directed mixte structure
#' .1, .4, .5,
#' .1, .3, .1), 3, 3)
#' )
#' gamma <- lapply(seq(3),
#' function(m) matrix(c(.8, .1, .1,
#' .1, .8, .1,
#' .1, .1, .8), 3, 3, byrow = TRUE))
#' pi <- list(rep(1, 3)/3, NULL, c(.1, .3, .6), NULL)
#' directed = c(FALSE, FALSE, FALSE, TRUE)
#' gmlv <- mlvsbm_simulate_generalized_network(n = rep(n, 4),
#' Q = rep(3, 4),
#' pi = pi,
#' gamma = gamma,
#' alpha = alpha,
#' directed = directed,
#' distribution = rep("bernoulli", 4))
#' \dontrun{
#' fit <- mlvsbm_estimate_generalized_network(gmlv,
#' fit_options = list(ve = "joint"))
#' plot(fit)
#' fit2 <- mlvsbm_estimate_generalized_network(gmlv,
#' fit_options = list(ve = "sequential"))
#' fitone <- mlvsbm_estimate_generalized_network(gmlv2, nb_clusters = rep(3, 4),
#' fit_options = list(ve = "sequential"))
#' fit_from_scratch <- mlvsbm_estimate_generalized_network(
#' gmlv,
#' init_clustering = lapply(seq(4), function(x)rep(1, n)),
#' init_method = "merge_split",
#' fit_options = list(ve = "joint"))
#' }
mlvsbm_estimate_generalized_network <-
function(gmlv, nb_clusters = NULL, init_clustering = NULL, nb_cores = NULL,
init_method = "hierarchical",
fit_options = list(ve = "joint", init_points = "all", Qmax = NA)) {
if (! "GenMLVSBM" %in% class(gmlv)) {
stop("Object gmlv must be of class GenMLVSBM,
please use the function mlvsbm_create_network to create one")
}
os <- Sys.info()["sysname"]
if (is.null(nb_cores)) {
if (os != 'Windows') {
nb_cores <-
max(parallel::detectCores(all.tests = FALSE, logical = TRUE) %/% 2, 1)
if (is.na(nb_cores)) nb_cores <- 1
} else {
nb_cores <- 1
}
}
fitopts <- list(ve = "joint", init_points = "all", Qmax = NA)
fit_options <- utils::modifyList(fitopts, fit_options)
gmlv$fit_options <- fit_options
#browser()
if (is.null(nb_clusters)) {
if (is.null(init_clustering)) {
fit_sbm <-
bettermc::mclapply(
X = seq_along(gmlv$adjacency_matrix),
FUN = function(l) {
sbm::estimateSimpleSBM(
model = gmlv$distribution[l],
netMat = gmlv$adjacency_matrix[[l]],
estimOptions = list(verbosity = 0,
plot = FALSE, nbCores = 1L,
exploreMin = gmlv$max_clusters[l]))
}, mc.cores = nb_cores
)
sbm_fit <- lapply(X = seq(length(gmlv$nb_nodes)),
FUN = function(l) {
gmlv$estimate_level(level = l,
Z = fit_sbm[[l]]$memberships,
nb_cores = nb_cores,
init = "merged_split")
})
# sbm_fit <- lapply(X = seq(length(gmlv$nb_nodes)),
# FUN = function(l) {
# gmlv$estimate_level(level = l,
# nb_cores = nb_cores,
# init = init_method)
# })
nb_clusters <- vapply(sbm_fit, function(bm) which.max(bm$ICL),
FUN.VALUE = 1)
init_clustering <-
lapply(seq_along(sbm_fit),
function(m) sbm_fit[[m]]$models[[nb_clusters[m]]]$Z)
fit <- gmlv$estimate_all_bm(Q = nb_clusters,
Z = init_clustering,
nb_cores = nb_cores)
if (gmlv$fit_options$init_points == "all") {
if (is.na(gmlv$fit_options$Qmax))
gmlv$fit_options$Qmax <-
pmax(ceiling(log(gmlv$nb_nodes)), fit$nb_clusters + 2)
Zmax <- lapply(seq_along(gmlv$adjacency_matrix),
function(l) spcClust(gmlv$adjacency_matrix[[l]],
gmlv$fit_options$Qmax[l]))
fit2 <- gmlv$estimate_all_bm(Q = gmlv$fit_options$Qmax,
Z = Zmax,
nb_cores = nb_cores)
if (fit$ICL < fit2$ICL) fit <- fit2
}
ICL_sbm <- sum(vapply(sbm_fit,
function(bm) max(bm$ICL), FUN.VALUE = .1))
print(paste0("ICL for independent levels : ", ICL_sbm))
print(paste0("ICL for interdependent levels : ",
fit$ICL))
if (ICL_sbm <= fit$ICL) {
print("=====Interdependence is detected between levels!=====")
} else {
print("=====The levels of this network are independent!=====")
}
fit$reorder(order = "affiliation")
return(fit)
} else {
nb_clusters <- vapply(init_clustering, max, FUN.VALUE = 1)
fit <- gmlv$estimate_all_bm(Q = nb_clusters,
Z = init_clustering,
nb_cores = nb_cores)
print(paste0("ICL for interdependent levels : ",
fit$ICL))
fit$reorder(order = "affiliation")
return(fit)
}
} else {
if (is.null(init_clustering)) {
fit_sbm <-
bettermc::mclapply(
X = seq_along(gmlv$adjacency_matrix),
FUN = function(l) {
sbm::estimateSimpleSBM(
model = gmlv$distribution[l],
netMat = gmlv$adjacency_matrix[[l]],
estimOptions = list(verbosity = 0,
plot = FALSE, nbCores = 1L,
exploreMin = nb_clusters[l]))
}, mc.cores = nb_cores
)
init_clustering <- lapply(fit_sbm, function(fit) fit$memberships)
}
fit <- gmlv$estimate_one(Q = nb_clusters,
Z = init_clustering,
nb_cores = nb_cores)
print(paste0("ICL for interdependent levels : ",
fit$ICL))
fit$reorder(order = "affiliation")
return(fit)
}
}