forked from aestrivex/bctpy
/
clustering.py
676 lines (562 loc) · 22 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
"""
Metrics which group nodes within graphs into clusters.
"""
from __future__ import division, print_function
import numpy as np
from ..modularity import modularity_louvain_und_sign
from ..distance import breadthdist
from ..utils import cuberoot, dummyvar, binarize
from ..due import due, BibTeX
def agreement(ci, buffsz=1000):
"""
Takes as input a set of vertex partitions CI of
dimensions [vertex x partition]. Each column in CI contains the
assignments of each vertex to a class/community/module. This function
aggregates the partitions in CI into a square [vertex x vertex]
agreement matrix D, whose elements indicate the number of times any two
vertices were assigned to the same class.
In the case that the number of nodes and partitions in CI is large
(greater than ~1000 nodes or greater than ~1000 partitions), the script
can be made faster by computing D in pieces. The optional input BUFFSZ
determines the size of each piece. Trial and error has found that
BUFFSZ ~ 150 works well.
Parameters
----------
ci : NxM :obj:`numpy.ndarray`
set of M (possibly degenerate) partitions of N nodes
buffsz : int | None
sets buffer size. If not specified, defaults to 1000
Returns
-------
D : NxN :obj:`numpy.ndarray`
agreement matrix
"""
ci = np.array(ci)
n_nodes, n_partitions = ci.shape
if n_partitions <= buffsz: # Case 1: Use all partitions at once
ind = dummyvar(ci)
D = np.dot(ind, ind.T)
else: # Case 2: Add together results from subsets of partitions
a = np.arange(0, n_partitions, buffsz)
b = np.arange(buffsz, n_partitions, buffsz)
if len(a) != len(b):
b = np.append(b, n_partitions)
D = np.zeros((n_nodes, n_nodes))
for i, j in zip(a, b):
y = ci[:, i:j]
ind = dummyvar(y)
D += np.dot(ind, ind.T)
np.fill_diagonal(D, 0)
return D
def agreement_weighted(ci, wts):
"""
D = AGREEMENT_WEIGHTED(CI,WTS) is identical to AGREEMENT, with the
exception that each partitions contribution is weighted according to
the corresponding scalar value stored in the vector WTS. As an example,
suppose CI contained partitions obtained using some heuristic for
maximizing modularity. A possible choice for WTS might be the Q metric
(Newman's modularity score). Such a choice would add more weight to
higher modularity partitions.
NOTE: Unlike AGREEMENT, this script does not have the input argument
BUFFSZ.
Parameters
----------
ci : MxN :obj:`numpy.ndarray`
set of M (possibly degenerate) partitions of N nodes
wts : Mx1 :obj:`numpy.ndarray`
relative weight of each partition
Returns
-------
D : NxN :obj:`numpy.ndarray`
weighted agreement matrix
"""
ci = np.array(ci)
m, n = ci.shape
wts = np.array(wts) / np.sum(wts)
D = np.zeros((n, n))
for i in range(m):
d = dummyvar(ci[i, :].reshape(1, n))
D += np.dot(d, d.T) * wts[i]
return D
def clustering_coef_bd(A):
"""
The clustering coefficient is the fraction of triangles around a node
(equiv. the fraction of nodes neighbors that are neighbors of each other).
Parameters
----------
A : NxN :obj:`numpy.ndarray`
binary directed connection matrix
Returns
-------
C : Nx1 :obj:`numpy.ndarray`
clustering coefficient vector
Notes
-----
Methodological note: In directed graphs, 3 nodes generate up to 8
triangles (2*2*2 edges). The number of existing triangles is the main
diagonal of S^3/2. The number of all (in or out) neighbour pairs is
K(K-1)/2. Each neighbour pair may generate two triangles. "False pairs"
are i<->j edge pairs (these do not generate triangles). The number of
false pairs is the main diagonal of A^2.
Thus the maximum possible number of triangles =
= (2 edges)*([ALL PAIRS] - [FALSE PAIRS])
= 2 * (K(K-1)/2 - diag(A^2))
= K(K-1) - 2(diag(A^2))
"""
S = A + A.T # symmetrized input graph
K = np.sum(S, axis=1) # total degree (in+out)
cyc3 = np.diag(np.dot(S, np.dot(S, S))) / 2 # number of 3-cycles
K[np.where(cyc3 == 0)] = np.inf # if no 3-cycles exist, make C=0
# number of all possible 3 cycles
CYC3 = K * (K - 1) - 2 * np.diag(np.dot(A, A))
C = cyc3 / CYC3
return C
def clustering_coef_bu(G):
"""
The clustering coefficient is the fraction of triangles around a node
(equiv. the fraction of nodes neighbors that are neighbors of each other).
Parameters
----------
A : NxN :obj:`numpy.ndarray`
binary undirected connection matrix
Returns
-------
C : Nx1 :obj:`numpy.ndarray`
clustering coefficient vector
"""
n = len(G)
C = np.zeros((n,))
for u in range(n):
V, = np.where(G[u, :])
k = len(V)
if k >= 2: # degree must be at least 2
S = G[np.ix_(V, V)]
C[u] = np.sum(S) / (k * k - k)
return C
def clustering_coef_wd(W):
"""
The weighted clustering coefficient is the average "intensity" of
triangles around a node.
Parameters
----------
W : NxN :obj:`numpy.ndarray`
weighted directed connection matrix
Returns
-------
C : Nx1 :obj:`numpy.ndarray`
clustering coefficient vector
Notes
-----
Methodological note (also see clustering_coef_bd)
The weighted modification is as follows:
- The numerator: adjacency matrix is replaced with weights matrix ^ 1/3
- The denominator: no changes from the binary version
The above reduces to symmetric and/or binary versions of the clustering
coefficient for respective graphs.
"""
A = np.logical_not(W == 0).astype(float) # adjacency matrix
S = cuberoot(W) + cuberoot(W.T) # symmetrized weights matrix ^1/3
K = np.sum(A + A.T, axis=1) # total degree (in+out)
cyc3 = np.diag(np.dot(S, np.dot(S, S))) / 2 # number of 3-cycles
K[np.where(cyc3 == 0)] = np.inf # if no 3-cycles exist, make C=0
# number of all possible 3 cycles
CYC3 = K * (K - 1) - 2 * np.diag(np.dot(A, A))
C = cyc3 / CYC3 # clustering coefficient
return C
def clustering_coef_wu(W):
"""
The weighted clustering coefficient is the average "intensity" of
triangles around a node.
Parameters
----------
W : NxN :obj:`numpy.ndarray`
weighted undirected connection matrix
Returns
-------
C : Nx1 :obj:`numpy.ndarray`
clustering coefficient vector
"""
K = np.array(np.sum(np.logical_not(W == 0), axis=1), dtype=float)
ws = cuberoot(W)
cyc3 = np.diag(np.dot(ws, np.dot(ws, ws)))
K[np.where(cyc3 == 0)] = np.inf # if no 3-cycles exist, set C=0
C = cyc3 / (K * (K - 1))
return C
def clustering_coef_wu_sign(W, coef_type='default'):
"""
Returns the weighted clustering coefficient generalized or separated
for positive and negative weights.
Three Algorithms are supported; herefore referred to as default, zhang,
and constantini.
1. Default (Onnela et al.), as in the traditional clustering coefficient
computation. Computed separately for positive and negative weights.
2. Zhang & Horvath. Similar to Onnela formula except weight information
incorporated in denominator. Reduces sensitivity of the measure to
weights directly connected to the node of interest. Computed
separately for positive and negative weights.
3. Constantini & Perugini generalization of Zhang & Horvath formula.
Takes both positive and negative weights into account simultaneously.
Particularly sensitive to non-redundancy in path information based on
sign. Returns only one value.
Parameters
----------
W : NxN :obj:`numpy.ndarray`
weighted undirected connection matrix
corr_type : {'default', 'zhang', 'constantini'}
Allowed values are 'default', 'zhang', 'constantini'
Returns
-------
Cpos : Nx1 :obj:`numpy.ndarray`
Clustering coefficient vector for positive weights
Cneg : Nx1 :obj:`numpy.ndarray`
Clustering coefficient vector for negative weights, unless
coef_type == 'constantini'.
References:
Onnela et al. (2005) Phys Rev E 71:065103
Zhang & Horvath (2005) Stat Appl Genet Mol Biol 41:1544-6115
Costantini & Perugini (2014) PLOS ONE 9:e88669
"""
n = len(W)
np.fill_diagonal(W, 0)
if coef_type == 'default':
W_pos = W * (W > 0)
K_pos = np.array(np.sum(np.logical_not(W_pos == 0), axis=1),
dtype=float)
ws_pos = cuberoot(W_pos)
cyc3_pos = np.diag(np.dot(ws_pos, np.dot(ws_pos, ws_pos)))
K_pos[np.where(cyc3_pos == 0)] = np.inf
C_pos = cyc3_pos / (K_pos * (K_pos - 1))
W_neg = -W * (W < 0)
K_neg = np.array(np.sum(np.logical_not(W_neg == 0), axis=1),
dtype=float)
ws_neg = cuberoot(W_neg)
cyc3_neg = np.diag(np.dot(ws_neg, np.dot(ws_neg, ws_neg)))
K_neg[np.where(cyc3_neg == 0)] = np.inf
C_neg = cyc3_neg / (K_neg * (K_neg - 1))
return C_pos, C_neg
elif coef_type in ('zhang', 'Zhang'):
W_pos = W * (W > 0)
cyc3_pos = np.zeros((n,))
cyc2_pos = np.zeros((n,))
W_neg = -W * (W < 0)
cyc3_neg = np.zeros((n,))
cyc2_neg = np.zeros((n,))
for i in range(n):
for j in range(n):
for q in range(n):
cyc3_pos[i] += W_pos[j, i] * W_pos[i, q] * W_pos[j, q]
cyc3_neg[i] += W_neg[j, i] * W_neg[i, q] * W_neg[j, q]
if j != q:
cyc2_pos[i] += W_pos[j, i] * W_pos[i, q]
cyc2_neg[i] += W_neg[j, i] * W_neg[i, q]
cyc2_pos[np.where(cyc3_pos == 0)] = np.inf
C_pos = cyc3_pos / cyc2_pos
cyc2_neg[np.where(cyc3_neg == 0)] = np.inf
C_neg = cyc3_neg / cyc2_neg
return C_pos, C_neg
elif coef_type in ('constantini', 'Constantini'):
cyc3 = np.zeros((n,))
cyc2 = np.zeros((n,))
for i in range(n):
for j in range(n):
for q in range(n):
cyc3[i] += W[j, i] * W[i, q] * W[j, q]
if j != q:
cyc2[i] += W[j, i] * W[i, q]
cyc2[np.where(cyc3 == 0)] = np.inf
C = cyc3 / cyc2
return C
def consensus_und(D, tau, reps=1000):
"""
This algorithm seeks a consensus partition of the
agreement matrix D. The algorithm used here is almost identical to the
one introduced in Lancichinetti & Fortunato (2012): The agreement
matrix D is thresholded at a level TAU to remove an weak elements. The
resulting matrix is then partitions REPS number of times using the
Louvain algorithm (in principle, any clustering algorithm that can
handle weighted matrixes is a suitable alternative to the Louvain
algorithm and can be substituted in its place). This clustering
produces a set of partitions from which a new agreement is built. If
the partitions have not converged to a single representative partition,
the above process repeats itself, starting with the newly built
agreement matrix.
NOTE: In this implementation, the elements of the agreement matrix must
be converted into probabilities.
NOTE: This implementation is slightly different from the original
algorithm proposed by Lanchichinetti & Fortunato. In its original
version, if the thresholding produces singleton communities, those
nodes are reconnected to the network. Here, we leave any singleton
communities disconnected.
Parameters
----------
D : NxN :obj:`numpy.ndarray`
agreement matrix with entries between 0 and 1 denoting the probability
of finding node i in the same cluster as node j
tau : float
threshold which controls the resolution of the reclustering
reps : int
number of times the clustering algorithm is reapplied. default value
is 1000.
Returns
-------
ciu : Nx1 :obj:`numpy.ndarray`
consensus partition
"""
def unique_partitions(cis):
# relabels the partitions to recognize different numbers on same
# topology
n, r = np.shape(cis) # ci represents one vector for each rep
ci_tmp = np.zeros(n)
for i in range(r):
for j, u in enumerate(sorted(
np.unique(cis[:, i], return_index=True)[1])):
ci_tmp[np.where(cis[:, i] == cis[u, i])] = j
cis[:, i] = ci_tmp
# so far no partitions have been deleted from ci
# now squash any of the partitions that are completely identical
# do not delete them from ci which needs to stay same size, so make
# copy
ciu = []
cis = cis.copy()
c = np.arange(r)
# count=0
while (c != 0).sum() > 0:
ciu.append(cis[:, 0])
dup = np.where(np.sum(np.abs(cis.T - cis[:, 0]), axis=1) == 0)
cis = np.delete(cis, dup, axis=1)
c = np.delete(c, dup)
return np.transpose(ciu)
n = len(D)
flag = True
while flag:
flag = False
dt = D * (D >= tau)
np.fill_diagonal(dt, 0)
if np.size(np.where(dt == 0)) == 0:
ciu = np.arange(1, n + 1)
else:
cis = np.zeros((n, reps))
for i in np.arange(reps):
cis[:, i], _ = modularity_louvain_und_sign(dt)
ciu = unique_partitions(cis)
nu = np.size(ciu, axis=1)
if nu > 1:
flag = True
D = agreement(cis) / reps
return np.squeeze(ciu + 1)
def get_components(A, no_depend=False):
"""
Returns the components of an undirected graph specified by the binary and
undirected adjacency matrix adj. Components and their constitutent nodes
are assigned the same index and stored in the vector, comps. The vector,
comp_sizes, contains the number of nodes beloning to each component.
Parameters
----------
A : NxN :obj:`numpy.ndarray`
binary undirected adjacency matrix
no_depend : Any
Does nothing, included for backwards compatibility
Returns
-------
comps : Nx1 :obj:`numpy.ndarray`
vector of component assignments for each node
comp_sizes : Mx1 :obj:`numpy.ndarray`
vector of component sizes
Notes
-----
Note: disconnected nodes will appear as components with a component
size of 1
Note: The identity of each component (i.e. its numerical value in the
result) is not guaranteed to be identical the value returned in BCT,
matlab code, although the component topology is.
Many thanks to Nick Cullen for providing this implementation
"""
if not np.all(A == A.T): # ensure matrix is undirected
raise ValueError('get_components requires an undirected matrix')
A = binarize(A, copy=True)
n = len(A)
np.fill_diagonal(A, 1)
edge_map = [{u, v} for u in range(n) for v in range(n) if A[u, v] == 1]
union_sets = []
for item in edge_map:
temp = []
for s in union_sets:
if not s.isdisjoint(item):
item = s.union(item)
else:
temp.append(s)
temp.append(item)
union_sets = temp
comps = np.array([i+1 for v in range(n) for i in
range(len(union_sets)) if v in union_sets[i]])
comp_sizes = np.array([len(s) for s in union_sets])
return comps, comp_sizes
def get_components_old(A, no_depend=False):
"""
Returns the components of an undirected graph specified by the binary and
undirected adjacency matrix adj. Components and their constitutent nodes
are assigned the same index and stored in the vector, comps. The vector,
comp_sizes, contains the number of nodes beloning to each component.
Parameters
----------
adj : NxN :obj:`numpy.ndarray`
binary undirected adjacency matrix
no_depend : bool
If true, doesn't import networkx to do the calculation. Default value
is false.
Returns
-------
comps : Nx1 :obj:`numpy.ndarray`
vector of component assignments for each node
comp_sizes : Mx1 :obj:`numpy.ndarray`
vector of component sizes
Notes
-----
Note: disconnected nodes will appear as components with a component
size of 1
Note: The identity of each component (i.e. its numerical value in the
result) is not guaranteed to be identical the value returned in BCT,
although the component topology is.
Note: networkx is used to do the computation efficiently. If networkx is
not available a breadth-first search that does not depend on networkx is
used instead, but this is less efficient. The corresponding BCT function
does the computation by computing the Dulmage-Mendelsohn decomposition. I
don't know what a Dulmage-Mendelsohn decomposition is and there doesn't
appear to be a python equivalent. If you think of a way to implement this
better, let me know.
"""
# nonsquare matrices cannot be symmetric; no need to check
if not np.all(A == A.T): # ensure matrix is undirected
raise ValueError('get_components requires an undirected matrix')
A = binarize(A, copy=True)
n = len(A)
np.fill_diagonal(A, 1)
try:
if no_depend:
raise ImportError()
else:
import networkx as nx
net = nx.from_numpy_matrix(A)
cpts = list(nx.connected_components(net))
cptvec = np.zeros((n,))
cptsizes = np.zeros(len(cpts))
for i, cpt in enumerate(cpts):
cptsizes[i] = len(cpt)
for node in cpt:
cptvec[node] = i + 1
except ImportError:
# if networkx is not available use less efficient breadth first search
cptvec = np.zeros((n,))
r, _ = breadthdist(A)
for node, reach in enumerate(r):
if cptvec[node] > 0:
continue
else:
cptvec[np.where(reach)] = np.max(cptvec) + 1
cptsizes = np.zeros(np.max(cptvec))
for i in np.arange(np.max(cptvec)):
cptsizes[i] = np.size(np.where(cptvec == i + 1))
return cptvec, cptsizes
def number_of_components(A):
_, csizes = get_components(A)
return len(csizes)
@due.dcite(BibTeX("""
@article{luce1949method,
title={A method of matrix analysis of group structure},
author={Luce, R Duncan and Perry, Albert D},
journal={Psychometrika},
volume={14},
number={2},
pages={95--116},
year={1949},
publisher={Springer}
}"""), description='Introduces transitivity for binary graphs.')
def transitivity_bd(A):
"""
Transitivity is the ratio of 'triangles to triplets' in the network.
(A classical version of the clustering coefficient).
Parameters
----------
A : NxN :obj:`numpy.ndarray`
binary directed connection matrix
Returns
-------
T : float
transitivity scalar
Notes
-----
Methodological note: In directed graphs, 3 nodes generate up to 8
triangles (2*2*2 edges). The number of existing triangles is the main
diagonal of S^3/2. The number of all (in or out) neighbour pairs is
K(K-1)/2. Each neighbour pair may generate two triangles. "False pairs"
are i<->j edge pairs (these do not generate triangles). The number of
false pairs is the main diagonal of A^2. Thus the maximum possible
number of triangles = (2 edges)*([ALL PAIRS] - [FALSE PAIRS])
= 2 * (K(K-1)/2 - diag(A^2))
= K(K-1) - 2(diag(A^2))
"""
S = A + A.T # symmetrized input graph
K = np.sum(S, axis=1) # total degree (in+out)
cyc3 = np.diag(np.dot(S, np.dot(S, S))) / 2 # number of 3-cycles
CYC3 = K * (K - 1) - 2 * np.diag(np.dot(A, A)) # number of all 3-cycles
return np.sum(cyc3) / np.sum(CYC3)
def transitivity_bu(A):
"""
Transitivity is the ratio of 'triangles to triplets' in the network.
(A classical version of the clustering coefficient).
Parameters
----------
A : NxN :obj:`numpy.ndarray`
binary undirected connection matrix
Returns
-------
T : float
transitivity scalar
"""
tri3 = np.trace(np.dot(A, np.dot(A, A)))
tri2 = np.sum(np.dot(A, A)) - np.trace(np.dot(A, A))
return tri3 / tri2
def transitivity_wd(W):
"""
Transitivity is the ratio of 'triangles to triplets' in the network.
(A classical version of the clustering coefficient).
Parameters
----------
W : NxN :obj:`numpy.ndarray`
weighted directed connection matrix
Returns
-------
T : int
transitivity scalar
Methodological note (also see note for clustering_coef_bd)
The weighted modification is as follows:
- The numerator: adjacency matrix is replaced with weights matrix ^ 1/3
- The denominator: no changes from the binary version
The above reduces to symmetric and/or binary versions of the clustering
coefficient for respective graphs.
"""
A = np.logical_not(W == 0).astype(float) # adjacency matrix
S = cuberoot(W) + cuberoot(W.T) # symmetrized weights matrix ^1/3
K = np.sum(A + A.T, axis=1) # total degree (in+out)
cyc3 = np.diag(np.dot(S, np.dot(S, S))) / 2 # number of 3-cycles
K[np.where(cyc3 == 0)] = np.inf # if no 3-cycles exist, make T=0
# number of all possible 3-cycles
CYC3 = K * (K - 1) - 2 * np.diag(np.dot(A, A))
return np.sum(cyc3) / np.sum(CYC3) # transitivity
def transitivity_wu(W):
"""
Transitivity is the ratio of 'triangles to triplets' in the network.
(A classical version of the clustering coefficient).
Parameters
----------
W : NxN :obj:`numpy.ndarray`
weighted undirected connection matrix
Returns
-------
T : int
transitivity scalar
"""
K = np.sum(np.logical_not(W == 0), axis=1)
ws = cuberoot(W)
cyc3 = np.diag(np.dot(ws, np.dot(ws, ws)))
return np.sum(cyc3, axis=0) / np.sum(K * (K - 1), axis=0)