/
09-supporting-information-C.R
165 lines (146 loc) · 6.4 KB
/
09-supporting-information-C.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#===============================================================================
# 09-supporting-information-C.R
# Purpose: to replicate Table 3 of the paper, where we provide a summary of the
# hitchhikers discovered at each stage of the discovery process, as
# well as information about the best performing models.
# Article: "More Effective Than We Thought: Accounting for Legislative
# Hitchhikers Reveals a More Inclusive and Productive Lawmaking
# Process."
# Journal: American Journal of Political Science
# Year: 2018
# Authors: Andreu Casas, Matt Denny, and John Wilkerson
#===============================================================================
# PACKAGES
#===============================================================================
# - install dplyr only when it is missing, so that re-running the script does
#   not re-download the package (and does not need network access or a
#   configured CRAN mirror when dplyr is already available)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
# - load the packages
library(dplyr)
# PATHS & CONSTANTS
#===============================================================================
# - path to where the predictions made during the hitchhiker discovery process
#   are located
pred_path <- "../data/predictions/"

# MAIN
#===============================================================================
# - initializing final summary dataset; one row is appended per iteration
results <- NULL

# ========== Iteration 1 ===========
iter <- 1

# - reading the training set and counting its true positives and true negatives
labs_db <- read.csv("../data/hr146_bi80_uni90_labeled.csv")
# ... the labels are tabulated once; the first count is taken as the negatives
#     and the second as the positives
#     NOTE(review): relies on true_match_num2 having exactly two values, with
#     the negative class sorting first (e.g. 0/1) -- confirm against the data
label_counts <- table(labs_db$true_match_num2)
Tneg <- as.numeric(label_counts[1])
Tpos <- as.numeric(label_counts[2])

# - reading the information about the models and parameters in stage 1 and 2 of
#   the discovery process
# ... the first best models selected in stage 1 (before adding extra
#     constraints)
best_models <- read.csv(paste0(pred_path, "best_models1.csv"))
stage1_n <- nrow(best_models)
# ... stage 1 parameter: the smallest value found across the precision_pe and
#     recall_pe columns of those models
stage1_par <- min(best_models$precision_pe, best_models$recall_pe)

# ... reading the information for the best performing models of stage 1
best_best_models <- read.csv(paste0(pred_path, "best_best_models1.csv"))
stage2_n <- nrow(best_best_models)
# ... hard-coded for this iteration (iterations 2 to 4 take it from the
#     bill_mult_match column of their best_best_models file)
stage2_par <- 10

# ... reading the information about the hitchhikers predicted in the first stage
preds <- read.csv(paste0(pred_path, "hitchhiker_predictions_stage01.csv"))
preds_n <- nrow(preds)

# ... the training set of the second iteration: the rows it gained over the
#     current training set are counted as new predictions, the remaining
#     predictions as old ones
#     NOTE(review): this reads labs_db_iter_2a.csv, while the loop for
#     iterations 2 to 4 reads labs_db_iter_2.csv -- confirm both are intended
next_labs_n <- nrow(read.csv(paste0(pred_path, "labs_db_iter_2a.csv")))
new_preds_n <- next_labs_n - nrow(labs_db)
old_preds_n <- preds_n - new_preds_n

# ... ensemble precision and recall are left missing for the first iteration
ensemble_pr <- NA
ensemble_rec <- NA

# - summary row for this iteration
new_row <- data.frame(
  iter = iter,
  training_size = Tpos + Tneg,
  Tpos = Tpos,
  Tneg = Tneg,
  stage1_n = stage1_n,
  stage1_par = stage1_par,
  stage2_n = stage2_n,
  stage2_par = stage2_par,
  preds_n = preds_n,
  old_preds_n = old_preds_n,
  new_preds_n = new_preds_n,
  ensemble_pr = ensemble_pr,
  ensemble_rec = ensemble_rec
)
results <- rbind(results, new_row)

# - building the training set for the next iteration of the process (#2)
# ... a bill ID is made of the first three "-"-separated fields of a version
#     ID; comp2 pairs the two bill IDs of each comparison
#     NOTE(review): the loop for iterations 2 to 4 re-reads labs_db from disk,
#     so these columns do not appear to be used later in this script
labs_db$BillID_a <- vapply(
  strsplit(as.character(labs_db$version_a), split = "-", fixed = TRUE),
  function(fields) paste0(fields[1:3], collapse = "-"),
  character(1)
)
labs_db$BillID_b <- vapply(
  strsplit(as.character(labs_db$version_b), split = "-", fixed = TRUE),
  function(fields) paste0(fields[1:3], collapse = "-"),
  character(1)
)
labs_db$comp2 <- paste0(labs_db$BillID_a, "&", labs_db$BillID_b)
# ======== Iteration 2 to 4 ========
# - helpers shared by every iteration of the loop
# ... split_field(): the n-th `sep`-separated field of each element of `ids`
#     (NA when an element has fewer than n fields)
split_field <- function(ids, sep, n) {
  vapply(
    strsplit(as.character(ids), split = sep, fixed = TRUE),
    function(fields) fields[n],
    character(1)
  )
}
# ... bill_id_from_version(): the first three "-"-separated fields of each
#     version ID, joined back with "-"
bill_id_from_version <- function(version_ids) {
  vapply(
    strsplit(as.character(version_ids), split = "-", fixed = TRUE),
    function(fields) paste0(fields[1:3], collapse = "-"),
    character(1)
  )
}
# ... format_estimate(): "<estimate>% {<lower>-<upper>}", bounds rounded to two
#     decimals
format_estimate <- function(pe, lwr, upr) {
  paste0(pe, "% {", round(lwr, 2), "-", round(upr, 2), "}")
}

# - looping through the data of iter 2, 3, and 4.
for (i in 2:4) {
  iter <- i
  fname <- paste0("iter_", iter, ".csv")

  # - training set: the labels are tabulated once; the first count is taken as
  #   the negatives and the second as the positives
  #   NOTE(review): relies on true_match_num2 having exactly two values, with
  #   the negative class sorting first (e.g. 0/1) -- confirm against the data
  labs_db <- read.csv(paste0(pred_path, "labs_db_", fname))
  label_counts <- table(labs_db$true_match_num2)
  Tneg <- as.numeric(label_counts[1])
  Tpos <- as.numeric(label_counts[2])

  # - models and parameters for this stage
  best_models <- read.csv(paste0(pred_path, "best_models_", fname))
  stage1_n <- nrow(best_models)
  # ... smallest value found across the precision_pe and recall_pe columns
  stage1_par <- min(best_models$precision_pe, best_models$recall_pe)
  best_best_models <- read.csv(paste0(pred_path, "best_best_models_", fname))
  stage2_n <- nrow(best_best_models)
  stage2_par <- max(best_best_models$bill_mult_match)

  # - predictions: old and new
  preds <- read.csv(paste0(pred_path, "ensemble_preds_", fname))
  preds_n <- nrow(preds)

  # ... when the training set has no bill-level pair ID yet, derive it from
  #     `comp`, which holds the two version IDs separated by "&"
  if (!("comp2" %in% names(labs_db))) {
    labs_db$version_a <- split_field(labs_db$comp, "&", 1)
    labs_db$version_b <- split_field(labs_db$comp, "&", 2)
    labs_db$BillID_a <- bill_id_from_version(labs_db$version_a)
    labs_db$BillID_b <- bill_id_from_version(labs_db$version_b)
    labs_db$comp2 <- paste0(labs_db$BillID_a, "&", labs_db$BillID_b)
  }

  # ... attach the known labels to the predictions
  #     NOTE(review): the join keys are left implicit (all columns shared by
  #     both tables, presumably only comp2); also, if comp2 is not unique in
  #     the training set the join returns more rows than preds had, so
  #     old_preds_n + new_preds_n can exceed preds_n -- confirm both points
  labs_db2 <- labs_db %>% dplyr::select(comp2, true_match_num2)
  labs_db2$comp2 <- as.character(labs_db2$comp2)
  preds$comp2 <- as.character(preds$comp2)
  preds <- left_join(preds, labs_db2)

  # ... predictions without a label in the training set are flagged with 99
  #     and counted as new; all the others are counted as old
  preds <- preds %>%
    mutate(
      true_match_num2 = ifelse(is.na(true_match_num2), 99, true_match_num2)
    )
  new_preds_n <- sum(preds$true_match_num2 == 99)
  old_preds_n <- nrow(preds) - new_preds_n

  # - ensemble precision and recall, with their lower and upper bounds, as
  #   read from the cross-validation results of this iteration
  crossval <- read.csv(paste0(pred_path, "crossval_res_", fname))
  ensemble_pr <- format_estimate(
    crossval$pr_pe, crossval$pr_lwr, crossval$pr_upr
  )
  ensemble_rec <- format_estimate(
    crossval$rec_pe, crossval$rec_lwr, crossval$rec_upr
  )

  # - summary row for this iteration (text columns are kept as character on
  #   every R version)
  new_row <- data.frame(
    iter = iter,
    training_size = Tpos + Tneg,
    Tpos = Tpos,
    Tneg = Tneg,
    stage1_n = stage1_n,
    stage1_par = stage1_par,
    stage2_n = stage2_n,
    stage2_par = stage2_par,
    preds_n = preds_n,
    old_preds_n = old_preds_n,
    new_preds_n = new_preds_n,
    ensemble_pr = ensemble_pr,
    ensemble_rec = ensemble_rec,
    stringsAsFactors = FALSE
  )
  results <- rbind(results, new_row)
}

# TABLE: information to plug into and replicate Table 3 of the article, in
# Supporting Information C.
#===============================================================================
print(results)