/
redundancy_analysis.py
233 lines (198 loc) · 9.21 KB
/
redundancy_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function
import numpy as np
from .base import Ordination, OrdinationResults
from .utils import corr, svd_rank, scale
class RDA(Ordination):
    r"""Compute redundancy analysis, a type of canonical analysis.

    It is related to PCA and multiple regression because the explained
    variables `Y` are fitted to the explanatory variables `X` and PCA
    is then performed on the fitted values. A similar process is
    performed on the residuals.

    RDA should be chosen if the studied gradient is small, and CCA
    when it's large, so that the contingency table is sparse.

    Parameters
    ----------
    Y : array_like
        :math:`n \times p` response matrix. Its columns need be
        dimensionally homogeneous (or you can set `scale_Y=True`).
    X : array_like
        :math:`n \times m, n \geq m` matrix of explanatory
        variables. Its columns need not be standardized, but doing so
        turns regression coefficients into standard regression
        coefficients.
    site_ids : iterable
        Identifiers of the :math:`n` rows (sites); forwarded to the
        `OrdinationResults` returned by `scores`.
    species_ids : iterable
        Identifiers of the :math:`p` response columns (species);
        forwarded to the `OrdinationResults` returned by `scores`.
    scale_Y : bool, optional
        Controls whether the response matrix columns are scaled to
        have unit standard deviation. Defaults to `False`.

    Notes
    -----
    The algorithm is based on [1]_, \S 11.1, and is expected to
    give the same results as ``rda(Y, X)`` in R's package vegan.

    See Also
    --------
    CCA

    References
    ----------
    .. [1] Legendre P. and Legendre L. 1998. Numerical
       Ecology. Elsevier, Amsterdam.
    """
    short_method_name = 'RDA'
    long_method_name = 'Redundancy Analysis'

    def __init__(self, Y, X, site_ids, species_ids, scale_Y=False):
        self.Y = np.asarray(Y, dtype=np.float64)
        self.X = np.asarray(X, dtype=np.float64)
        self.site_ids = site_ids
        self.species_ids = species_ids
        self._rda(scale_Y)

    def _rda(self, scale_Y):
        """Run the RDA computation and cache intermediate matrices.

        Stores ``U``, ``U_res``, ``F``, ``F_res``, ``Z``, ``u`` and
        ``eigenvalues`` as instance attributes for later use by `scores`.

        Raises
        ------
        ValueError
            If `Y` and `X` have different numbers of rows, or if `X`
            has more columns than rows.
        """
        n, p = self.Y.shape
        n_, m = self.X.shape
        if n != n_:
            raise ValueError(
                "Both data matrices must have the same number of rows.")
        if n < m:
            # Mmm actually vegan is able to do this case, too
            raise ValueError(
                "Explanatory variables cannot have less rows than columns.")

        # Centre response variables (they must be dimensionally
        # homogeneous)
        Y = scale(self.Y, with_std=scale_Y)
        # Centre explanatory variables
        X = scale(self.X, with_std=False)

        # Distribution of variables should be examined and transformed
        # if necessary (see paragraph 4 in p. 580 L&L 1998)

        # Compute Y_hat (fitted values by multivariate linear
        # regression, that is, linear least squares). Formula 11.6 in
        # L&L 1998 involves solving the normal equations, but that fails
        # when cond(X) ~ eps**(-0.5). A more expensive but much more
        # stable solution (fails when cond(X) ~ eps**-1) is computed
        # using the QR decomposition of X = QR:
        # (11.6) Y_hat = X [X' X]^{-1} X' Y
        #              = QR [R'Q' QR]^{-1} R'Q' Y
        #              = QR [R' R]^{-1} R'Q' Y
        #              = QR R^{-1} R'^{-1} R' Q' Y
        #              = Q Q' Y
        # and B (matrix of regression coefficients)
        # (11.4) B = [X' X]^{-1} X' Y
        #          = R^{-1} R'^{-1} R' Q' Y
        #          = R^{-1} Q'
        # Q, R = np.linalg.qr(X)
        # Y_hat = Q.dot(Q.T).dot(Y)
        # B = scipy.linalg.solve_triangular(R, Q.T.dot(Y))
        # This works provided X has full rank. When not, you can still
        # fix it using R's pseudoinverse or partitioning R. To avoid any
        # issues, like the numerical instability when trying to
        # reproduce an example in L&L where X was rank-deficient, we'll
        # just use `np.linalg.lstsq`, which uses the SVD decomposition
        # under the hood and so it's also more expensive.
        # rcond=-1 pins the historical machine-precision cutoff and
        # avoids the FutureWarning newer numpy emits for the default.
        B = np.linalg.lstsq(X, Y, rcond=-1)[0]
        Y_hat = X.dot(B)

        # Now let's perform PCA on the fitted values from the multiple
        # regression
        u, s, vt = np.linalg.svd(Y_hat, full_matrices=False)
        # vt are the right eigenvectors, which is what we need to
        # perform PCA. That is, we're changing points in Y_hat from the
        # canonical basis to the orthonormal basis given by the right
        # eigenvectors of Y_hat (or equivalently, the eigenvectors of
        # the covariance matrix Y_hat.T.dot(Y_hat))
        # See 3) in p. 583 in L&L 1998
        rank = svd_rank(Y_hat.shape, s)
        # Theoretically, there're at most min(p, m, n - 1) non-zero eigenvalues

        U = vt[:rank].T  # U as in Fig. 11.2

        # Ordination in the space of response variables. Its columns are
        # site scores. (Eq. 11.12)
        F = Y.dot(U)
        # Ordination in the space of explanatory variables. Its columns
        # are fitted site scores. (Eq. 11.13)
        Z = Y_hat.dot(U)

        # Canonical coefficients (formula 11.14)
        # C = B.dot(U)  # Not used

        Y_res = Y - Y_hat
        # PCA on the residuals
        u_res, s_res, vt_res = np.linalg.svd(Y_res, full_matrices=False)
        # See 9) in p. 587 in L&L 1998
        rank_res = svd_rank(Y_res.shape, s_res)
        # Theoretically, there're at most min(p, n - 1) non-zero eigenvalues
        U_res = vt_res[:rank_res].T
        F_res = Y_res.dot(U_res)  # Ordination in the space of residuals

        # Storing values needed to compute scores
        iter_ = (('U', U), ('U_res', U_res),
                 ('F', F),
                 ('F_res', F_res),
                 ('Z', Z),
                 ('u', u[:, :rank]))
        for val_name, val in iter_:
            setattr(self, val_name, val)

        # NOTE(review): these are the singular values of the fitted and
        # residual matrices, not variances (s**2 / (n - 1)); `scores`
        # computes proportions from them as-is — confirm against vegan.
        self.eigenvalues = np.r_[s[:rank], s_res[:rank_res]]

    def scores(self, scaling):
        r"""Compute site, species and biplot scores for different scalings.

        Parameters
        ----------
        scaling : int
            Scaling type 1 produces a distance biplot. It focuses on
            the ordination of rows (sites) because their transformed
            distances approximate their original euclidean
            distances. Especially interesting when most explanatory
            variables are binary.

            Scaling type 2 produces a correlation biplot. It focuses
            on the relationships among explained variables (`Y`). It
            is interpreted like scaling type 1, but taking into
            account that distances between objects don't approximate
            their euclidean distances.

            See more details about distance and correlation biplots in
            [1]_, \S 9.1.4.

        Returns
        -------
        OrdinationResults
            Object that stores the computed eigenvalues, the
            proportion explained by each of them (per unit),
            transformed coordinates for species and sites, biplot
            scores, site constraints, etc.

        Raises
        ------
        NotImplementedError
            If `scaling` is not 1 or 2.

        See Also
        --------
        OrdinationResults

        References
        ----------
        .. [1] Legendre P. and Legendre L. 1998. Numerical
           Ecology. Elsevier, Amsterdam.
        """
        if scaling not in {1, 2}:
            raise NotImplementedError("Only scalings 1, 2 available for RDA.")
        # According to the vegan-FAQ.pdf, the scaling factor for scores
        # is (notice that L&L 1998 says in p. 586 that such scaling
        # doesn't affect the interpretation of a biplot):
        eigvals = self.eigenvalues
        const = np.sum(eigvals**2)**0.25
        if scaling == 1:
            scaling_factor = const
        elif scaling == 2:
            scaling_factor = eigvals / const
        species_scores = np.hstack((self.U, self.U_res)) * scaling_factor
        site_scores = np.hstack((self.F, self.F_res)) / scaling_factor
        # TODO not yet used/displayed
        site_constraints = np.hstack((self.Z, self.F_res)) / scaling_factor
        # vegan seems to compute them as corr(self.X[:, :rank_X],
        # self.u) but I don't think that's a good idea. In fact, if
        # you take the example shown in Figure 11.3 in L&L 1998 you
        # can see that there's an arrow for each of the 4
        # environmental variables (depth, coral, sand, other) even if
        # other = not(coral or sand)
        biplot_scores = corr(self.X, self.u)
        # The "Correlations of environmental variables with site
        # scores" from table 11.4 are quite similar to vegan's biplot
        # scores, but they're computed like this:
        #   corr(self.X, self.F))
        return OrdinationResults(eigvals=eigvals,
                                 proportion_explained=eigvals / eigvals.sum(),
                                 species=species_scores,
                                 site=site_scores,
                                 biplot=biplot_scores,
                                 site_constraints=site_constraints,
                                 site_ids=self.site_ids,
                                 species_ids=self.species_ids)