/
R-to-variable.R
221 lines (200 loc) · 7.22 KB
/
R-to-variable.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#' @include variable-definition.R
NULL
#' Generic method for converting objects to Crunch representations
#'
#' R objects are converted to Crunch objects using the following rules:
#'
#' - Character vectors are converted into Crunch text variables
#' - Numeric vectors are converted into Crunch numeric variables
#' - Factors are converted to categorical variables
#' - Date and POSIXt vectors are converted into Crunch datetime variables
#' - Logical vectors are converted to Crunch categorical variables
#' - [VariableDefinition()]s are not converted, but the function can still
#' append additional metadata
#'
#' If you have other object types you wish to convert to Crunch variables,
#' you can declare methods for `toVariable`.
#' @param x An R vector you want to turn into a Crunch variable
#' @param ... Additional metadata fields for the variable, such as "name" and
#' "description". See the [API documentation](
#' https://crunch.io/api/reference/#post-/datasets/-dataset_id-/variables/)
#' for a complete list of valid attributes.
#' @return A `VariableDefinition` object. To add this to a dataset, either
#' assign it into the dataset (like `ds$newvar <- toVariable(...)`) or call
#' [addVariables()]. If you're adding a column of data to a dataset, it must be
#' as long as the number of rows in the dataset, or it may be a single value to
#' be recycled for all rows.
#' @rdname toVariable
#' @aliases toVariable
#' @seealso [VariableDefinition()] [addVariables()]
#' @examples
#' var1 <- rnorm(10)
#' toVariable(var1)
#' toVariable(var1, name = "Random", description = "Generated in R")
#' \dontrun{
#' ds$random <- toVariable(var1, name = "Random")
#' # Or, this way:
#' ds <- addVariables(ds, toVariable(var1, name = "Random"))
#' }
#' @export
setGeneric("toVariable", function(x, ...) standardGeneric("toVariable"))
#' @rdname toVariable
#' @export
setMethod("toVariable", "CrunchVarOrExpr", function(x, ...) {
structure(list(derivation = zcl(x), ...), class = "VariableDefinition")
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "character", function(x, ...) {
return(VariableDefinition(values = x, type = "text", ...))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "numeric", function(x, ...) {
return(VariableDefinition(values = x, type = "numeric", ...))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "factor", function(x, ...) {
return(VariableDefinition(
values = as.categorical.values(x), type = "categorical",
categories = categoriesFromLevels(levels(x)), ...
))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "Date", function(x, ...) {
return(VariableDefinition(
values = as.character(x), type = "datetime",
resolution = "D", ...
))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "POSIXt", function(x, ...) {
# R uses truncation rather than rounding for formatting ms, so numeric
# precision can cause us to be off. Get 4 digits and round them
# Upload as local time, even though it will be converted to UTC in crunch's database
# this means the "print" value of the date (eg when we get the date back, it'll be
# stored as UTC, so 1AM CST becomes 1AM UTC)
tzone <- attr(x, "tzone")
values <- strftime(x, "%Y-%m-%dT%H:%M:%OS4", tz = tzone)
values <- paste0(
substr(values, 1, 20),
as.character(round(as.numeric(substr(values, 21, 26)) / 10))
)
values[is.na(x)] <- ""
return(VariableDefinition(
values = values,
type = "datetime",
resolution = "ms", ...
))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "AsIs", function(x, ...) {
class(x) <- class(x)[-match("AsIs", class(x))]
return(toVariable(x, ...))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "VariableDefinition", function(x, ...) {
return(modifyList(x, list(...)))
})
#' @rdname toVariable
#' @export
setMethod("toVariable", "logical", function(x, ...) {
vals <- as.categorical.values(x)
cats <- .selected.cats
## Pre-3VL category names
## Note that with the extra strict definition of `is.3vl`, this won't
## register as a "logical" type yet and so as.vector will continue to return
## this as categorical, not logical
cats[[1]]$name <- "True"
cats[[2]]$name <- "False"
return(VariableDefinition(
values = vals, type = "categorical",
categories = cats, ...
))
})
# haven::haven_labelled* are S3 classes, so we have to register them the
# labelled* classes are depricated in haven 2.0, but we are keeping them here
# for backwards and forward compatibility
setOldClass("labelled")
setOldClass("haven_labelled")
haven_labelled_func <- function(x, ...) {
# TODO: what if the values are numeric? Is it possible to tell these apart
# from the labelled object?
return(toVariable(as.factor(x), ...))
}
#' @rdname toVariable
#' @export
setMethod("toVariable", "labelled", haven_labelled_func)
#' @rdname toVariable
#' @export
setMethod("toVariable", "haven_labelled", haven_labelled_func)
setOldClass("labelled_spss")
setOldClass("haven_labelled_spss")
haven_labelled_spss_func <- function(x, ...) {
# TODO: what if the values are numeric? Is it possible to tell these apart
# from the labelled object?
# convert to factor quickly (the recommended workflow for labelled objects
# from haven, since there are few methods for labelled objects)
x_factor <- as.factor(x)
categories <- categoriesFromLevels(levels(x_factor))
# grab the user missing levels
user_missings <- levels(droplevels(x_factor[is.na(x)]))
# we aren't
categories <- lapply(categories, function(cat) {
if (cat$name %in% user_missings) {
cat$missing <- TRUE
}
return(cat)
})
return(VariableDefinition(
values = as.categorical.values(x_factor),
type = "categorical",
categories = categories,
...
))
}
#' @rdname toVariable
#' @export
setMethod("toVariable", "labelled_spss", haven_labelled_spss_func)
#' @rdname toVariable
#' @export
setMethod("toVariable", "haven_labelled_spss", haven_labelled_spss_func)
as.categorical.values <- function(x) {
vals <- as.integer(x)
vals[is.na(vals)] <- -1L
return(vals)
}
#' Convert a factor's levels into Crunch categories.
#'
#' Crunch categorical variables have slightly richer metadata than R's
#' factor variables. This function generates a list of category data from
#' a factor's levels which can then be further manipulated in R before being
#' imported into Crunch.
#'
#' @param level_vect A character vector containing the levels of a factor. Usually
#' obtained by running [base::levels()]
#'
#' @return A list with each category levels id, name, numeric_value, and missingness.
#' @rdname categoriesFromLevels
#' @export
#'
#' @examples
#'
#' categoriesFromLevels(levels(iris$Species))
categoriesFromLevels <- function(level_vect) {
if (anyDuplicated(level_vect)) {
warning(
"Duplicate factor levels given: disambiguating them ",
"in translation to Categorical type"
)
level_vect <- uniquify(level_vect)
}
return(c(lapply(seq_along(level_vect), function(i) {
list(id = i, name = level_vect[i], numeric_value = i, missing = FALSE)
}), list(.no.data)))
}