-
Notifications
You must be signed in to change notification settings - Fork 267
/
_mantel.py
478 lines (386 loc) · 18.6 KB
/
_mantel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from itertools import combinations
import numpy as np
import pandas as pd
import scipy.misc
from scipy.stats import pearsonr, spearmanr
from skbio.stats.distance import DistanceMatrix
from skbio.util._decorator import experimental
@experimental(as_of="0.4.0")
def mantel(x, y, method='pearson', permutations=999, alternative='two-sided',
strict=True, lookup=None):
"""Compute correlation between distance matrices using the Mantel test.
The Mantel test compares two distance matrices by computing the correlation
between the distances in the lower (or upper) triangular portions of the
symmetric distance matrices. Correlation can be computed using Pearson's
product-moment correlation coefficient or Spearman's rank correlation
coefficient.
As defined in [1]_, the Mantel test computes a test statistic :math:`r_M`
given two symmetric distance matrices :math:`D_X` and :math:`D_Y`.
:math:`r_M` is defined as
.. math::
r_M=\\frac{1}{d-1}\\sum_{i=1}^{n-1}\\sum_{j=i+1}^{n}
stand(D_X)_{ij}stand(D_Y)_{ij}
where
.. math::
d=\\frac{n(n-1)}{2}
and :math:`n` is the number of rows/columns in each of the distance
matrices. :math:`stand(D_X)` and :math:`stand(D_Y)` are distance matrices
with their upper triangles containing standardized distances. Note that
since :math:`D_X` and :math:`D_Y` are symmetric, the lower triangular
portions of the matrices could equivalently have been used instead of the
upper triangular portions (the current function behaves in this manner).
If ``method='spearman'``, the above equation operates on ranked distances
instead of the original distances.
Statistical significance is assessed via a permutation test. The rows and
columns of the first distance matrix (`x`) are randomly permuted a
number of times (controlled via `permutations`). A correlation coefficient
is computed for each permutation and the p-value is the proportion of
permuted correlation coefficients that are equal to or more extreme
than the original (unpermuted) correlation coefficient. Whether a permuted
correlation coefficient is "more extreme" than the original correlation
coefficient depends on the alternative hypothesis (controlled via
`alternative`).
Parameters
----------
x, y : DistanceMatrix or array_like
Input distance matrices to compare. If `x` and `y` are both
``DistanceMatrix`` instances, they will be reordered based on matching
IDs (see `strict` and `lookup` below for handling matching/mismatching
IDs); thus they are not required to be in the same ID order. If `x` and
`y` are ``array_like``, no reordering is applied and both matrices must
have the same shape. In either case, `x` and `y` must be at least 3x3
in size *after* reordering and matching of IDs.
method : {'pearson', 'spearman'}
Method used to compute the correlation between distance matrices.
permutations : int, optional
Number of times to randomly permute `x` when assessing statistical
significance. Must be greater than or equal to zero. If zero,
statistical significance calculations will be skipped and the p-value
will be ``np.nan``.
alternative : {'two-sided', 'greater', 'less'}
Alternative hypothesis to use when calculating statistical
significance. The default ``'two-sided'`` alternative hypothesis
calculates the proportion of permuted correlation coefficients whose
magnitude (i.e. after taking the absolute value) is greater than or
equal to the absolute value of the original correlation coefficient.
``'greater'`` calculates the proportion of permuted coefficients that
are greater than or equal to the original coefficient. ``'less'``
calculates the proportion of permuted coefficients that are less than
or equal to the original coefficient.
strict : bool, optional
If ``True``, raises a ``ValueError`` if IDs are found that do not exist
in both distance matrices. If ``False``, any nonmatching IDs are
discarded before running the test. See `n` (in Returns section below)
for the number of matching IDs that were used in the test. This
parameter is ignored if `x` and `y` are ``array_like``.
lookup : dict, optional
Maps each ID in the distance matrices to a new ID. Used to match up IDs
across distance matrices prior to running the Mantel test. If the IDs
already match between the distance matrices, this parameter is not
necessary. This parameter is disallowed if `x` and `y` are
``array_like``.
Returns
-------
corr_coeff : float
Correlation coefficient of the test (depends on `method`).
p_value : float
p-value of the test.
n : int
Number of rows/columns in each of the distance matrices, after any
reordering/matching of IDs. If ``strict=False``, nonmatching IDs may
have been discarded from one or both of the distance matrices prior to
running the Mantel test, so this value may be important as it indicates
the *actual* size of the matrices that were compared.
Raises
------
ValueError
If `x` and `y` are not at least 3x3 in size after reordering/matching
of IDs, or an invalid `method`, number of `permutations`, or
`alternative` are provided.
TypeError
If `x` and `y` are not both ``DistanceMatrix`` instances or
``array_like``.
See Also
--------
DistanceMatrix
scipy.stats.pearsonr
scipy.stats.spearmanr
pwmantel
Notes
-----
The Mantel test was first described in [2]_. The general algorithm and
interface are similar to ``vegan::mantel``, available in R's vegan
package [3]_.
``np.nan`` will be returned for the p-value if `permutations` is zero or if
the correlation coefficient is ``np.nan``. The correlation coefficient will
be ``np.nan`` if one or both of the inputs does not have any variation
(i.e. the distances are all constant) and ``method='spearman'``.
References
----------
.. [1] Legendre, P. and Legendre, L. (2012) Numerical Ecology. 3rd English
Edition. Elsevier.
.. [2] Mantel, N. (1967). "The detection of disease clustering and a
generalized regression approach". Cancer Research 27 (2): 209-220. PMID
6018555.
.. [3] http://cran.r-project.org/web/packages/vegan/index.html
Examples
--------
Import the functionality we'll use in the following examples:
>>> from skbio import DistanceMatrix
>>> from skbio.stats.distance import mantel
Define two 3x3 distance matrices:
>>> x = DistanceMatrix([[0, 1, 2],
... [1, 0, 3],
... [2, 3, 0]])
>>> y = DistanceMatrix([[0, 2, 7],
... [2, 0, 6],
... [7, 6, 0]])
Compute the Pearson correlation between them and assess significance using
a two-sided test with 999 permutations:
>>> coeff, p_value, n = mantel(x, y)
>>> print(round(coeff, 4))
0.7559
Thus, we see a moderate-to-strong positive correlation (:math:`r_M=0.7559`)
between the two matrices.
In the previous example, the distance matrices (``x`` and ``y``) have the
same IDs, in the same order:
>>> x.ids
('0', '1', '2')
>>> y.ids
('0', '1', '2')
If necessary, ``mantel`` will reorder the distance matrices prior to
running the test. The function also supports a ``lookup`` dictionary that
maps distance matrix IDs to new IDs, providing a way to match IDs between
distance matrices prior to running the Mantel test.
For example, let's reassign the distance matrices' IDs so that there are no
matching IDs between them:
>>> x.ids = ('a', 'b', 'c')
>>> y.ids = ('d', 'e', 'f')
If we rerun ``mantel``, we get the following error notifying us that there
are nonmatching IDs (this is the default behavior with ``strict=True``):
>>> mantel(x, y)
Traceback (most recent call last):
...
ValueError: IDs exist that are not in both distance matrices.
If we pass ``strict=False`` to ignore/discard nonmatching IDs, we see that
no matches exist between `x` and `y`, so the Mantel test still cannot be
run:
>>> mantel(x, y, strict=False)
Traceback (most recent call last):
...
ValueError: No matching IDs exist between the distance matrices.
To work around this, we can define a ``lookup`` dictionary to specify how
the IDs should be matched between distance matrices:
>>> lookup = {'a': 'A', 'b': 'B', 'c': 'C',
... 'd': 'A', 'e': 'B', 'f': 'C'}
``lookup`` maps each ID to ``'A'``, ``'B'``, or ``'C'``. If we rerun
``mantel`` with ``lookup``, we get the same results as the original
example where all distance matrix IDs matched:
>>> coeff, p_value, n = mantel(x, y, lookup=lookup)
>>> print(round(coeff, 4))
0.7559
``mantel`` also accepts input that is ``array_like``. For example, if we
redefine `x` and `y` as nested Python lists instead of ``DistanceMatrix``
instances, we obtain the same result:
>>> x = [[0, 1, 2],
... [1, 0, 3],
... [2, 3, 0]]
>>> y = [[0, 2, 7],
... [2, 0, 6],
... [7, 6, 0]]
>>> coeff, p_value, n = mantel(x, y)
>>> print(round(coeff, 4))
0.7559
It is import to note that reordering/matching of IDs (and hence the
``strict`` and ``lookup`` parameters) do not apply when input is
``array_like`` because there is no notion of IDs.
"""
if method == 'pearson':
corr_func = pearsonr
elif method == 'spearman':
corr_func = spearmanr
else:
raise ValueError("Invalid correlation method '%s'." % method)
if permutations < 0:
raise ValueError("Number of permutations must be greater than or "
"equal to zero.")
if alternative not in ('two-sided', 'greater', 'less'):
raise ValueError("Invalid alternative hypothesis '%s'." % alternative)
x, y = _order_dms(x, y, strict=strict, lookup=lookup)
n = x.shape[0]
if n < 3:
raise ValueError("Distance matrices must have at least 3 matching IDs "
"between them (i.e., minimum 3x3 in size).")
x_flat = x.condensed_form()
y_flat = y.condensed_form()
orig_stat = corr_func(x_flat, y_flat)[0]
if permutations == 0 or np.isnan(orig_stat):
p_value = np.nan
else:
perm_gen = (corr_func(x.permute(condensed=True), y_flat)[0]
for _ in range(permutations))
permuted_stats = np.fromiter(perm_gen, np.float, count=permutations)
if alternative == 'two-sided':
count_better = (np.absolute(permuted_stats) >=
np.absolute(orig_stat)).sum()
elif alternative == 'greater':
count_better = (permuted_stats >= orig_stat).sum()
else:
count_better = (permuted_stats <= orig_stat).sum()
p_value = (count_better + 1) / (permutations + 1)
return orig_stat, p_value, n
@experimental(as_of="0.4.0")
def pwmantel(dms, labels=None, method='pearson', permutations=999,
alternative='two-sided', strict=True, lookup=None):
"""Run Mantel tests for every pair of given distance matrices.
Runs a Mantel test for each pair of distance matrices and collates the
results in a ``DataFrame``. Distance matrices do not need to be in the same
ID order if they are ``DistanceMatrix`` instances. Distance matrices will
be re-ordered prior to running each pairwise test, and if ``strict=False``,
IDs that don't match between a pair of distance matrices will be dropped
prior to running the test (otherwise a ``ValueError`` will be raised if
there are nonmatching IDs between any pair of distance matrices).
Parameters
----------
dms : iterable of DistanceMatrix objects, array_like objects, or filepaths
to distance matrices. If they are ``array_like``, no reordering or
matching of IDs will be performed.
labels : iterable of str or int, optional
Labels for each distance matrix in `dms`. These are used in the results
``DataFrame`` to identify the pair of distance matrices used in a
pairwise Mantel test. If ``None``, defaults to monotonically-increasing
integers starting at zero.
method : {'pearson', 'spearman'}
Correlation method. See ``mantel`` function for more details.
permutations : int, optional
Number of permutations. See ``mantel`` function for more details.
alternative : {'two-sided', 'greater', 'less'}
Alternative hypothesis. See ``mantel`` function for more details.
strict : bool, optional
Handling of nonmatching IDs. See ``mantel`` function for more details.
lookup : dict, optional
Map existing IDs to new IDs. See ``mantel`` function for more details.
Returns
-------
pandas.DataFrame
``DataFrame`` containing the results of each pairwise test (one per
row). Includes the number of objects considered in each test as column
``n`` (after applying `lookup` and filtering nonmatching IDs if
``strict=False``). Column ``p-value`` will display p-values as ``NaN``
if p-values could not be computed (they are stored as ``np.nan`` within
the ``DataFrame``; see ``mantel`` for more details).
See Also
--------
mantel
DistanceMatrix.read
Notes
--------
Passing a list of filepaths can be useful as it allows for a smaller amount
of memory consumption as it only loads two matrices at a time as opposed to
loading all distance matrices into memory.
Examples
--------
Import the functionality we'll use in the following examples:
>>> from skbio import DistanceMatrix
>>> from skbio.stats.distance import pwmantel
Define three 3x3 distance matrices:
>>> x = DistanceMatrix([[0, 1, 2],
... [1, 0, 3],
... [2, 3, 0]])
>>> y = DistanceMatrix([[0, 2, 7],
... [2, 0, 6],
... [7, 6, 0]])
>>> z = DistanceMatrix([[0, 5, 6],
... [5, 0, 1],
... [6, 1, 0]])
Run Mantel tests for each pair of distance matrices (there are 3 possible
pairs):
>>> pwmantel((x, y, z), labels=('x', 'y', 'z'),
... permutations=0) # doctest: +NORMALIZE_WHITESPACE
statistic p-value n method permutations alternative
dm1 dm2
x y 0.755929 NaN 3 pearson 0 two-sided
z -0.755929 NaN 3 pearson 0 two-sided
y z -0.142857 NaN 3 pearson 0 two-sided
Note that we passed ``permutations=0`` to suppress significance tests; the
p-values in the output are labelled ``NaN``.
"""
num_dms = len(dms)
if num_dms < 2:
raise ValueError("Must provide at least two distance matrices.")
if labels is None:
labels = range(num_dms)
else:
if num_dms != len(labels):
raise ValueError("Number of labels must match the number of "
"distance matrices.")
if len(set(labels)) != len(labels):
raise ValueError("Labels must be unique.")
num_combs = scipy.misc.comb(num_dms, 2, exact=True)
results_dtype = [('dm1', object), ('dm2', object), ('statistic', float),
('p-value', float), ('n', int), ('method', object),
('permutations', int), ('alternative', object)]
results = np.empty(num_combs, dtype=results_dtype)
for i, pair in enumerate(combinations(zip(labels, dms), 2)):
(xlabel, x), (ylabel, y) = pair
if isinstance(x, str):
x = DistanceMatrix.read(x)
if isinstance(y, str):
y = DistanceMatrix.read(y)
stat, p_val, n = mantel(x, y, method=method, permutations=permutations,
alternative=alternative, strict=strict,
lookup=lookup)
results[i] = (xlabel, ylabel, stat, p_val, n, method, permutations,
alternative)
return pd.DataFrame.from_records(results, index=('dm1', 'dm2'))
def _order_dms(x, y, strict=True, lookup=None):
"""Intersect distance matrices and put them in the same order."""
x_is_dm = isinstance(x, DistanceMatrix)
y_is_dm = isinstance(y, DistanceMatrix)
if (x_is_dm and not y_is_dm) or (y_is_dm and not x_is_dm):
raise TypeError(
"Mixing DistanceMatrix and array_like input types is not "
"supported. Both x and y must either be DistanceMatrix instances "
"or array_like, but not mixed.")
elif x_is_dm and y_is_dm:
if lookup is not None:
x = _remap_ids(x, lookup, 'x', 'first')
y = _remap_ids(y, lookup, 'y', 'second')
id_order = [id_ for id_ in x.ids if id_ in y]
num_matches = len(id_order)
if (strict and ((num_matches != len(x.ids)) or
(num_matches != len(y.ids)))):
raise ValueError("IDs exist that are not in both distance "
"matrices.")
if num_matches < 1:
raise ValueError("No matching IDs exist between the distance "
"matrices.")
return x.filter(id_order), y.filter(id_order)
else:
# Both x and y aren't DistanceMatrix instances.
if lookup is not None:
raise ValueError("ID lookup can only be provided if inputs are "
"DistanceMatrix instances.")
x = DistanceMatrix(x)
y = DistanceMatrix(y)
if x.shape != y.shape:
raise ValueError("Distance matrices must have the same shape.")
return x, y
def _remap_ids(dm, lookup, label, order):
"Return a copy of `dm` with its IDs remapped based on `lookup`."""
try:
remapped_ids = [lookup[id_] for id_ in dm.ids]
except KeyError as e:
raise KeyError("All IDs in the %s distance matrix (%s) must be in "
"the lookup. Missing ID: %s" % (order, label, str(e)))
# Create a copy as we'll be modifying the IDs in place.
dm_copy = dm.copy()
dm_copy.ids = remapped_ids
return dm_copy