/
conditional-transform.R
251 lines (229 loc) · 9.77 KB
/
conditional-transform.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#' Conditional transformation
#'
#' Create a new variable that has values when specific conditions are met.
#' Conditions are specified using a series of formulas: the left-hand side is
#' the condition that must be true (a `CrunchLogicalExpr`) and the right-hand
#' side is where to get the value if the condition on the left-hand side is
#' true. This is commonly a Crunch variable but may be a string or numeric
#' value, depending on the type of variable you're constructing.
#'
#' The type of the new variable can depend on the type(s) of the source
#' variable(s). By default (`type=NULL`), the type of the new variable will be
#' the type of all of the source variables (that is, if all of the source
#' variables are text, the new variable type will be text, if all of the
#' source variables are categorical, the new variable will be categorical).
#' If there are multiple types in the source variables, the result will be a
#' text variable. The default behavior can be overridden by specifying
#' `type = "categorical"`, `"text"`, or `"numeric"`.
#'
#' `conditionalTransform` is similar to `makeCaseVariable`; however,
#' `conditionalTransform` can use other Crunch variables as a source of a
#' variable, whereas, `makeCaseVariable` can only use characters. This
#' additional power comes at a cost: `makeCaseVariable` can be executed
#' entirely on Crunch servers, so no data needs to be downloaded or uploaded
#' to/from the local R session. `conditionalTransform` on the other hand will
#' download the data necessary to construct the new variable.
#'
#' @param ... a list of conditions to evaluate (as formulas, see Details) as well as
#' other properties to pass to the new conditional variable (i.e. alias, description)
#' @param data a Crunch dataset object to use
#' @param else_condition a default value to use if none of the conditions are
#' true (default: `NA`)
#' @param type a character that is either "categorical", "text", "numeric" what
#' type of output should be returned? If `NULL`, the type of the source
#' variable will be used. (default: `NULL`) The source variables will be
#' converted to this type if necessary.
#' @param categories a vector of characters if `type="categorical"`, these are
#' all of the categories that should be in the resulting variable, in the order
#' they should be in the resulting variable or a set of Crunch categories.
#' @param formulas a list of conditions to evaluate (as formulas, see Details). If
#' specified, `...` must not contain other formulas specifying conditions.
#'
#' @return a Crunch `VariableDefinition`
#' @examples
#' \dontrun{
#'
#' ds$cat_opinion <- conditionalTransform(pet1 == "Cat" ~ Opinion1,
#' pet2 == "Cat" ~ Opinion2,
#' pet3 == "Cat" ~ Opinion3,
#' data = ds,
#' name = "Opinion of Cats"
#' )
#' }
#' @export
conditionalTransform <- function(..., data, else_condition = NA, type = NULL,
categories = NULL, formulas = NULL) {
dots <- list(...)
is_formula <- function(x) inherits(x, "formula")
dot_formulas <- Filter(is_formula, dots)
if (length(dot_formulas) > 0) {
if (!is.null(formulas)) {
halt(
"Must not supply conditions in both the ", dQuote("formulas"),
" argument and ", dQuote("...")
)
}
formulas <- dot_formulas
}
var_def <- Filter(Negate(is_formula), dots)
if (length(formulas) == 0) {
halt(
"Conditions must be supplied: ",
"Have you forgotten to supply conditions as formulas in either the ",
dQuote("formulas"), " argument, or through ", dQuote("..."), ""
)
}
if (!missing(type) && !type %in% c("categorical", "text", "numeric")) {
halt(
"Type must be either ", dQuote("categorical"), ", ",
dQuote("text"), ", or ", dQuote("numeric")
)
}
conditional_vals <- makeConditionalValues(formulas, data, else_condition)
if (!missing(type)) {
if (type == "numeric") {
result <- as.numeric(conditional_vals$values)
} else {
result <- as.character(conditional_vals$values)
}
} else {
# determine type
result <- conditional_vals$values
type <- conditional_vals$type
}
if (type != "categorical" & !is.null(categories)) {
warning(
"Type is not ", dQuote("categorical"), " ignoring ",
dQuote("categories")
)
}
var_def$type <- type
# add categories if necessary
if (type == "categorical") {
# if categories are supplied and there are any
if (missing(categories)) {
result <- factor(result)
# make categories from names
categories <- Categories(data = categoriesFromLevels(levels(result)))
} else {
if (!is.categories(categories)) {
# if categories aren't a Categories object,
# make categories from names
categories <- Categories(data = categoriesFromLevels(categories))
}
uni_results <- unique(result[!is.na(result)])
results_not_categories <- !uni_results %in% names(categories)
if (any(results_not_categories)) {
halt(
"When specifying categories, all categories in the ",
"results must be included. These categories are in the ",
"results that were not specified in categories: ",
serialPaste(uni_results[results_not_categories])
)
}
result <- factor(result, levels = names(categories))
}
categories <- ensureNoDataCategory(categories)
# make a category list to send with VariableDefinition and then store
# that and convert values to ids values
category_list <- categories
var_def$categories <- category_list
vals <- as.character(result)
vals[is.na(vals)] <- "No Data" # na is system default
var_def$values <- ids(categories[vals])
} else {
var_def$values <- result
}
class(var_def) <- "VariableDefinition"
return(var_def)
}
makeConditionalValues <- function(formulas, data, else_condition) {
n <- length(formulas)
cases <- vector("list", n)
values <- vector("list", n)
for (i in seq_len(n)) {
formula <- formulas[[i]]
if (length(formula) != 3) {
halt(
"The condition provided must be a proper formula: ",
deparseAndFlatten(formula)
)
}
cases[[i]] <- evalLHS(formula, data)
if (!inherits(cases[[i]], c("logical", "CrunchLogicalExpr"))) {
halt(
"The left-hand side provided must be a logical or a ",
"CrunchLogicalExpr: ", dQuote(LHS_string(formula))
)
}
values[[i]] <- evalRHS(formula, data)
}
# setup NAs for as default
# check all datasets are the same and get the reference from the unique one
ds_refs <- unlist(unique(lapply(c(cases, values), datasetReference)))
if (!missing(data)) {
ds_refs <- unique(c(ds_refs, datasetReference(data)))
}
if (length(ds_refs) > 1) {
halt(
"There must be only one dataset referenced. Did you accidentally ",
"supply more than one?"
)
} else if (length(ds_refs) == 0) {
halt(
"There must be at least one crunch expression in the formulas ",
"specifying cases or use the data argument to specify a dataset."
)
}
n_rows <- nrow(CrunchDataset(crGET(ds_refs)))
# grab booleans for cases
case_indices <- lapply(cases, which)
# deduplicate indices, favoring the first true condition
case_indices <- lapply(seq_along(case_indices), function(i) {
setdiff(case_indices[[i]], unlist(case_indices[seq_len(i - 1)]))
})
# grab the values needed from source variables
values_to_fill <- Map(function(ind, var) {
if (inherits(var, c("CrunchVariable", "CrunchExpr"))) {
# grab the variable contents at inds we take inds after as.vector
# in case there is a filter applied. If the API allowed for
# returning values at specific indices, even when a dataset is
# filtered, this wouldn't be necessary (cf pivotal: #151013797)
return(as.vector(var)[ind])
} else {
# if var isn't a crunch variable or expression, just return var
return(var)
}
}, ind = case_indices, var = values)
# determine the types before collation (since factor coercion is less than
# ideal, we need to do this before we collate)
pre_collation_types <- vapply(values, class, character(1))
values <- collateValues(values_to_fill, case_indices, else_condition, n_rows)
if (all(pre_collation_types == "factor")) {
type <- "categorical"
} else if (is.numeric(values)) {
type <- "numeric"
} else {
# catch all, in case there are R types like logicals that Crunch would
# treat as character or categorical
type <- "text"
}
return(list(values = values, type = type))
}
# because factors by default coerce into their IDs, which is almost never what
# we want, we need to do some magic to collate the values together.
collateValues <- function(values_to_fill, case_indices, else_condition,
n_rows) {
result <- rep(else_condition, n_rows)
# fill values
for (i in seq_along(case_indices)) {
vals <- values_to_fill[[i]]
# change all factors to characters temporarily to avoid accidental
# coercion to ids (the default if result is not already a factor)
if (is.factor(vals)) {
vals <- as.character(vals)
}
result[case_indices[[i]]] <- vals
}
return(result)
}