-
Notifications
You must be signed in to change notification settings - Fork 7
/
postestimate_test_hausman.R
211 lines (183 loc) · 7.42 KB
/
postestimate_test_hausman.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#' Regression-based Hausman test
#'
#' \lifecycle{experimental}
#'
#' Calculates the regression-based Hausman test to be used to compare
#' OLS to 2SLS estimates or 2SLS to 3SLS estimates. See e.g., \insertCite{Wooldridge2010;textual}{cSEM}
#' (pages 131 f.) for details.
#'
#' The function is somewhat experimental. Only use if you know what you are doing.
#'
#' @usage testHausman(
#' .object = NULL,
#' .eval_plan = c("sequential", "multicore", "multisession"),
#' .handle_inadmissibles = c("drop", "ignore", "replace"),
#' .R = 499,
#' .resample_method = c("bootstrap", "jackknife"),
#' .seed = NULL
#' )
#'
#' @inheritParams csem_arguments
#'
#' @references
#' \insertAllCited{}
#'
#' @seealso [csem()], [cSEMResults]
#'
#' @example inst/examples/example_testHausman.R
#'
#' @export
testHausman <- function(
  .object               = NULL,
  .eval_plan            = c("sequential", "multicore", "multisession"),
  .handle_inadmissibles = c("drop", "ignore", "replace"),
  .R                    = 499,
  .resample_method      = c("bootstrap", "jackknife"),
  .seed                 = NULL
) {
  .eval_plan            <- match.arg(.eval_plan)
  .handle_inadmissibles <- match.arg(.handle_inadmissibles)
  .resample_method      <- match.arg(.resample_method)

  # Only a single (non-resampled, non-multigroup) linear-model result is
  # supported; all other cSEMResults classes are rejected up front.
  if (!inherits(.object, "cSEMResults_default")) {
    stop2("Hausman test currently only available for linear models.")
  }

  ## Closure to be resampled: computes, per structural equation with
  ## endogenous RHS variables, the second-stage estimates of the
  ## regression-based Hausman test (coefficients on the first-stage
  ## residuals), expressed entirely in terms of the construct VCV matrix.
  fun_residuals <- function(.object) {
    ## Extract relevant quantities
    m <- .object$Information$Model
    P <- .object$Estimates$Construct_VCV

    # Dependent (LHS) variables of the structural equations
    dep_vars <- rownames(m$structural)[rowSums(m$structural) != 0]

    res <- lapply(dep_vars, function(y) {
      # Which of the variables in dep_vars have instruments specified, i.e.
      # have endogenous variables on the RHS. By default: FALSE.
      endo_in_RHS <- FALSE
      if (!is.null(m$instruments)) {
        endo_in_RHS <- y %in% names(m$instruments)
      } else {
        # Without instruments there is nothing to compare; fail early.
        stop2("The following error occurred in the testHausman() function:\n",
              "Instruments required.")
      }

      # All independent variables (X) of the structural equation of construct y
      # (including the endogenous RHS variables)
      names_X <- colnames(m$structural[y, m$structural[y, ] != 0, drop = FALSE])

      ## Only for equations with endogenous variables on the RHS do we need to
      ## compute the Hausman test.
      # NOTE: `&&` (not the vectorized `&`) is correct here as both operands
      # are scalars; it also short-circuits if endo_in_RHS is FALSE.
      if (endo_in_RHS && (.object$Information$Arguments$.approach_paths == "2SLS")) {
        ## First stage
        # Note: Technically, we only need to regress the P endogenous variables
        # (y1) on the L instruments and the K exogenous independent variables
        # (which must be part of Z).
        # Therefore: y1 (N x P) and Z (N x (L + K)) and
        #   beta_1st = (Z'Z)^-1*(Z'y1)
        #
        # beta_1st would be ((L + K) x P), i.e. each column represents the
        # first stage estimates for a regression of the instruments on
        # the p'th endogenous variable.
        names_endo <- rownames(m$instruments[[y]]) # names of the endogenous variables (y1's)
        names_Z    <- colnames(m$instruments[[y]]) # names of the instruments

        # Assuming that P (the construct correlation matrix) also contains
        # the instruments (ensured if only internal instruments are allowed)
        beta_1st <- solve(P[names_Z, names_Z, drop = FALSE],
                          P[names_Z, names_endo, drop = FALSE])

        ## Second stage
        # Note: we have to use (construct) correlations here since using
        # the plain score would yield inconsistent estimates for concepts
        # modeled as common factors. Hence we need to "translate" the
        # regression based second stage estimation
        #   y = X * beta_1 + v_hat*beta_2 + u
        # and the estimator
        #   beta_2nd = (W'W)^-1W'y
        # into correlations.
        #
        #   y1_hat = Z*beta_1st
        #   v_hat  = y1 - y1_hat = y1 - Z*beta_1st
        #
        #   y = X * beta_1 + v_hat*beta_2 + u
        #
        # Define W = [X; v_hat]
        #
        #   E(W'W) = E[X'X      ; X'v_hat
        #              v_hat'X  ; v_hat'v_hat]
        #   E(W'y) = E[X'y ; v_hat'y]'
        ww11 <- P[names_X, names_X, drop = FALSE]
        ww12 <- P[names_X, names_endo, drop = FALSE] -
          P[names_X, names_Z, drop = FALSE] %*% beta_1st
        ww21 <- t(ww12)
        ww22 <- P[names_endo, names_endo, drop = FALSE] -
          P[names_endo, names_Z, drop = FALSE] %*% beta_1st -
          t(beta_1st) %*% P[names_Z, names_endo, drop = FALSE] +
          t(beta_1st) %*% P[names_Z, names_Z, drop = FALSE] %*% beta_1st

        WW <- rbind(cbind(ww11, ww12), cbind(ww21, ww22))

        wy1 <- P[names_X, y, drop = FALSE]
        wy2 <- P[names_endo, y, drop = FALSE] -
          t(beta_1st) %*% P[names_Z, y, drop = FALSE]
        Wy  <- rbind(wy1, wy2)

        ## Estimate the second stage equation
        beta_2nd <- solve(WW, Wy)
        rownames(beta_2nd) <- c(names_X, paste0("Resid_", names_endo))
        colnames(beta_2nd) <- y
        beta_2nd
      } else {
        # Equation without endogenous RHS variables (or not estimated by
        # 2SLS): marked NA and filtered out below.
        NA
      }
    })
    names(res) <- dep_vars
    res <- Filter(Negate(anyNA), res)

    # Vectorize "res" to be able to use resamplecSEMResults (which requires
    # output to be a vector or a matrix)
    # 1. Assign a unique name to each element ("<dep_construct>_<coef_name>")
    res <- lapply(res, function(x) {
      rownames(x) <- paste0(colnames(x), "_", rownames(x))
      x
    })
    # 2. Combine, vectorize and assign names
    res2 <- do.call(rbind, res)
    res  <- c(res2)
    names(res) <- rownames(res2)
    res
  }

  ## Resample to get standard errors for the second-stage coefficients
  out <- resamplecSEMResults(
    .object               = .object,
    .resample_method      = .resample_method,
    .R                    = .R,
    .handle_inadmissibles = .handle_inadmissibles,
    .user_funs            = fun_residuals,
    .eval_plan            = .eval_plan,
    .seed                 = .seed
  )

  ## Get relevant quantities: point estimates, bootstrap/jackknife SEs,
  ## t-statistics and two-sided normal-approximation p-values.
  out_infer <- infer(out)
  beta <- out$Estimates$Estimates_resample$Estimates1$User_fun$Original
  se   <- out_infer$User_fun$sd
  t    <- beta / se
  p_normal <- 2 * pnorm(abs(t), mean = 0, sd = 1, lower.tail = FALSE)
  # ci_percentile <- out_infer$User_fun$CI_percentile

  out_data_frame <- data.frame(
    # Each endogenous equation contributes as many rows as it has
    # second-stage coefficients; the repetition count is derived from the
    # instrument matrices. vapply() is used instead of sapply() for a
    # type-stable result.
    "Dep_construct" = rep(out$Information$Model$cons_endo,
                          vapply(out$Information$Model$instruments, sum,
                                 numeric(1))),
    "Name"      = names(beta),
    "Estimate"  = beta,
    "Std_error" = se,
    "t_stat"    = t,
    "p_value"   = p_normal,
    # "Ci_perc_L" = ci_percentile[1, ],
    # "Ci_perc_H" = ci_percentile[2, ],
    stringsAsFactors = FALSE
  )
  rownames(out_data_frame) <- NULL

  ## Split the combined data frame into one per endogenous equation and strip
  ## the "<dep_construct>_" prefix from the coefficient names again.
  out_temp <- split(out_data_frame, out_data_frame$Dep_construct)
  for (i in names(out_temp)) {
    out_temp[[i]] <- out_temp[[i]][, -1] # remove Dep_construct column
    # fixed = TRUE treats the construct name literally (a name containing a
    # regex metacharacter such as "." must not be interpreted as a pattern);
    # sub() removes only the first occurrence, i.e. exactly the prefix added
    # in fun_residuals, leaving any later occurrences in the name intact.
    out_temp[[i]][, "Name"] <- sub(paste0(i, "_"), "",
                                   out_temp[[i]][, "Name"], fixed = TRUE)
  }

  out <- list(
    "Regressions" = out_temp,
    "Information" = list(
      "Seed"           = .seed,
      "Number_of_runs" = .R
    )
  )

  class(out) <- "cSEMTestHausman"
  return(out)
}