/
moran.R
244 lines (238 loc) · 10.8 KB
/
moran.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#' The Moran coefficient (Moran's I)
#'
#' @description The Moran coefficient, a measure of spatial autocorrelation (also known as Global Moran's I)
#'
#' @export
#'
#' @param x Numeric vector of input values, length n.
#' @param w An n x n spatial connectivity matrix. See \code{\link[geostan]{shape2mat}}.
#' @param digits Number of digits to round results to.
#' @param warn If `FALSE`, no warning will be printed to inform you when observations with zero neighbors or `NA` values have been dropped.
#' @param na.rm If `na.rm = TRUE`, observations with `NA` values will be dropped from both `x` and `w`.
#' @return The Moran coefficient, a numeric value.
#'
#' @details
#'
#' The formula for the Moran coefficient (MC) is
#' \deqn{MC = \frac{n}{K}\frac{\sum_i \sum_j w_{ij} (y_i - \overline{y})(y_j - \overline{y})}{\sum_i (y_i - \overline{y})^2}}
#' where \eqn{n} is the number of observations and \eqn{K} is the sum of all values in the spatial connectivity matrix \eqn{W}, i.e., the sum of all row-sums: \eqn{K = \sum_i \sum_j w_{ij}}.
#'
#' If any observations with no neighbors are found (i.e. \code{any(Matrix::rowSums(w) == 0)}) they will be dropped automatically and a message will print stating how many were dropped. The alternative is for those observations to have a spatial lage of zero---but zero is not a neutral value, see the Moran scatter plot.
#'
#' @seealso \link[geostan]{moran_plot}, \link[geostan]{lisa}, \link[geostan]{aple}, \link[geostan]{gr}, \link[geostan]{lg}
#'
#' @examples
#' library(sf)
#' data(georgia)
#' w <- shape2mat(georgia, style = "W")
#' x <- georgia$ICE
#' mc(x, w)
#' @source
#'
#' Chun, Yongwan, and Daniel A. Griffith. Spatial Statistics and Geostatistics: Theory and Applications for Geographic Information Science and Technology. Sage, 2013.
#'
#' Cliff, Andrew David, and J. Keith Ord. Spatial processes: models & applications. Taylor & Francis, 1981.
#'
#' @importFrom Matrix rowSums
#'
mc <- function(x, w, digits = 3, warn = TRUE, na.rm = FALSE) {
check_sa_data(x, w)
na_idx <- which(is.na(x))
if (na.rm == TRUE && length(na_idx) > 0) {
if (warn) message(length(na_idx), " NA values found in x; they will be removed from the data before calculating the Moran coefficient. If matrix w was row-standardized, it may not longer be. You may want to use a binary connectivity matrix by using style = 'B' in shape2mat.")
x <- x[-na_idx]
w <- w[-na_idx, -na_idx]
}
if (any(Matrix::rowSums(w) == 0)) {
zero.idx <- which(Matrix::rowSums(w) == 0)
if (warn) message(length(zero.idx), " observations with no neighbors found. They will be removed from the data before calculating the Moran coefficient.")
x <- x[-zero.idx]
w <- w[-zero.idx, -zero.idx]
}
xbar <- mean(x)
z <- x - xbar
ztilde <- as.numeric(w %*% z)
A <- sum(Matrix::rowSums(w))
n <- length(x)
mc <- as.numeric( n/A * (z %*% ztilde) / (z %*% z))
return(round(mc, digits = digits))
}
#' Moran scatter plot
#'
#' @description Plots a set of values against their spatially lagged values and gives the Moran coefficient as a measure of spatial autocorrelation.
#'
#' @export
#'
#' @import ggplot2
#' @param x A numeric vector of length n.
#' @param w An n x n spatial connectivity matrix.
#' @param xlab Label for the x-axis.
#' @param ylab Label for the y-axis.
#' @param pch Symbol type.
#' @param col Symbol color.
#' @param size Symbol size.
#' @param alpha Symbol transparency.
#' @param lwd Width of the regression line.
#' @param na.rm If `na.rm = TRUE`, any observations of `x` with `NA` values will be dropped from `x` and from `w`.
#' @details For details on the symbol parameters see the documentation for \link[ggplot2]{geom_point}.
#'
#' If any observations with no neighbors are found (i.e. \code{any(Matrix::rowSums(w) == 0)}) they will be dropped automatically and a message will print stating how many were dropped.
#'
#' @return Returns a \code{gg} plot, a scatter plot with \code{x} on the horizontal and its spatially lagged values on the vertical axis (i.e. a Moran scatter plot).
#'
#' @seealso \link[geostan]{mc}, \link[geostan]{lisa}, \link[geostan]{aple}
#'
#' @source
#'
#' Anselin, Luc. "Local indicators of spatial association—LISA." Geographical analysis 27, no. 2 (1995): 93-115.
#'
#' @examples
#' data(georgia)
#' x <- georgia$income
#' w <- shape2mat(georgia, "W")
#' moran_plot(x, w)
#' @import ggplot2
#' @importFrom signs signs
#' @importFrom Matrix rowSums
moran_plot <- function(x, w, xlab = "x (centered)", ylab = "Spatial Lag", pch = 20, col = "darkred", size = 2, alpha = 1, lwd = 0.5, na.rm = FALSE) {
check_sa_data(x, w)
na_idx <- which(is.na(x))
if (length(na_idx) > 0) {
if (na.rm == TRUE) {
msg <- paste(length(na_idx),
"NA values found in x will be dropped from data x and matrix w"
)
if (!inherits(w, "ngCMatrix")) msg <- paste(msg,
"\n If you are using a row-standardized matrix: dropping the NA values means that some rows in matrix w do not sum to 1; you may want to remove missing observations manually and then make row-standardized w, or use a binary matrix (see ?shape2mat)."
)
message(msg)
x <- x[-na_idx]
w <- w[-na_idx, -na_idx]
} else {
stop("NA values found in moran_plot(x, ...). Use moran_plot(x, na.rm = TRUE, ...)")
}
}
if (any(Matrix::rowSums(w) == 0)) {
zero.idx <- which(Matrix::rowSums(w) == 0)
message(length(zero.idx), " observations with no neighbors found. They will be dropped from the data before creating the Moran plot.")
x <- x[-zero.idx]
w <- w[-zero.idx, -zero.idx]
}
x <- x - mean(x)
xlag <- as.numeric(w %*% x)
sub <- paste0("MC = ", round(mc(x, w),3))
ggplot(data.frame(x = x,
xlag = xlag),
aes(x = x, y = xlag)
) +
geom_hline(yintercept = mean(xlag),
lty = 3) +
geom_vline(xintercept = mean(x),
lty = 3) +
geom_point(
pch = 20,
colour = col,
size = size,
alpha = alpha,
aes(x = x,
y = xlag)
) +
geom_smooth(aes(x = x,
y = xlag),
method = "lm",
formula = "y ~ x",
lwd = lwd,
col = "black",
se = FALSE) +
scale_x_continuous(labels = signs::signs) +
scale_y_continuous(labels = signs::signs) +
labs(x = xlab,
y = ylab,
subtitle = sub) +
theme_classic()
}
#' Local Moran's I
#'
#' @export
#' @description A local indicator of spatial association (LISA) based on Moran's I (the Moran coefficient) for exploratory data analysis.
#'
#' @param x Numeric vector of length `n`.
#' @param w An `n x n` spatial connectivity matrix. See \link[geostan]{shape2mat}. If \code{w} is not row standardized (\code{all(Matrix::rowSums(w) == 1)}), it will automatically be row-standardized.
#' @param type Return the type of association also (High-High, Low-Low, High-Low, and Low-High)? Defaults to \code{FALSE}.
#' @param scale If `TRUE`, then `x` will automatically be standardized using `scale(x, center = TRUE, scale = TRUE)`. If `FALSE`, then the variate will be centered but not scaled, using `scale(x, center = TRUE, scale = FALSE)`.
#' @param digits Number of digits to round results to.
#' @details
#'
#' The values of `x` will automatically be centered first with \code{z = scale(x, center = TRUE, scale = scale)} (with user control over the `scale` argument). The LISA values are the product of each \code{z} value with the weighted sum of their respective surrounding value: \deqn{I_i = z_i \sum_j w_{ij} z_j} (or in R code: \code{lisa = z * (w \%*\% z)}). These are for exploratory analysis and model diagnostics.
#'
#' An above-average value (i.e. positive z-value) with positive mean spatial lag indicates local positive spatial autocorrelation and is designated type "High-High"; a low value surrounded by high values indicates negative spatial autocorrelation and is designated type "Low-High", and so on.
#'
#' This function uses Equation 7 from Anselin (1995). Note that the `spdep` package uses Formula 12, which divides the same value by a constant term \eqn{\sum_i z_i^2/n}. So the `geostan` version can be made equal to the `spdep` version by dividing by that value.
#'
#' @return If \code{type = FALSE} a numeric vector of lisa values for exploratory analysis of local spatial autocorrelation. If \code{type = TRUE}, a \code{data.frame} with columns \code{Li} (the lisa value) and \code{type}.
#'
#' @seealso \code{\link[geostan]{moran_plot}}, \code{\link[geostan]{mc}}, \code{\link[geostan]{aple}}, \code{\link[geostan]{lg}}, \code{\link[geostan]{gr}}
#'
#' @source
#'
#' Anselin, Luc. "Local indicators of spatial association—LISA." Geographical Analysis 27, no. 2 (1995): 93-115.
#'
#' @examples
#' library(ggplot2)
#' library(sf)
#' data(georgia)
#' w <- shape2mat(georgia, "W")
#' x <- georgia$ICE
#' li = lisa(x, w)
#' head(li)
#' ggplot(georgia, aes(fill = li$Li)) +
#' geom_sf() +
#' scale_fill_gradient2()
#' @importFrom Matrix rowSums
lisa <- function(x, w, type = TRUE, scale = TRUE, digits = 3) {
check_sa_data(x, w)
z <- scale(x, center = TRUE, scale = scale)
lagz <- as.numeric(w %*% z)
zi <- as.numeric(z * lagz)
if (any(Matrix::rowSums(w) == 0)) {
zero.idx <- which(Matrix::rowSums(w) == 0)
zi[zero.idx] <- NA
message(length(zero.idx), " observations with no neighbors found. Their LISA values are being set to NA.")
}
zi <- round(zi, digits)
if (!type) return (zi)
type <- ifelse(z > 0 & lagz > 0, "HH",
ifelse(z < 0 & lagz > 0, "LH",
ifelse(z < 0 & lagz < 0, "LL",
ifelse(z > 0 & lagz < 0, "HL", NA
))))
return (data.frame(Li = zi, type = type))
}
#' Expected value of the residual Moran coefficient
#'
#' @description Expected value for the Moran coefficient of model residuals under the null hypothesis of no spatial autocorrelation.
#'
#' @export
#'
#' @param X model matrix, including column of ones.
#' @param C Connectivity matrix.
#'
#' @source
#'
#' Chun, Yongwan and Griffith, Daniel A. (2013). Spatial statistics and geostatistics. Sage, p. 18.
#'
#' @return Returns a numeric value.
#'
#' @examples
#' data(georgia)
#' C <- shape2mat(georgia)
#' X <- model.matrix(~ ICE + college, georgia)
#' expected_mc(X, C)
expected_mc <- function(X, C) {
C <- as.matrix(C)
n = nrow(X)
k = ncol(X)
under <- (n-k) * sum(rowSums(C))
mc = -n * sum(diag( solve(t(X) %*% X) %*% t(X) %*% C %*% X )) / under
return(as.numeric(mc))
}