-
Notifications
You must be signed in to change notification settings - Fork 7
/
recode-with-table.R
976 lines (935 loc) · 34.5 KB
/
recode-with-table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
# Removing Note
. <- NULL
#' @title is equal
#' @description Function to compare even with NA present
#' This function returns TRUE wherever elements are the same, including NA's,
#' and false everywhere else.
#'
#' @param v1 variable 1
#' @param v2 variable 2
#'
#' @return boolean value of whether or not v1 and v2 are equal
#'
#' @examples
#' library(cchsflow)
#' is_equal(1,2)
#' # FALSE
#'
#' is_equal(1,1)
#' # TRUE
#'
#' 1==NA
#' # NA
#'
#' is_equal(1,NA)
#' # FALSE
#'
#' NA==NA
#' # NA
#'
#' is_equal(NA,NA)
#' # TRUE
#' @export
is_equal <- function(v1, v2) {
same <- (v1 == v2) | (is.na(v1) & is.na(v2))
# anything compared to NA equals NA
# replaces all instanses of NA with FALSE
same[is.na(same)] <- FALSE
return(same)
}
#' Recode with Table
#'
#' Recode with Table is responsible for recoding values of a dataset based on
#' the specifications in variable_details.
#'
#' The \href{https://github.com/Big-Life-Lab/cchsflow/blob/master/inst/extdata/variable_details.csv}{variable_details}
#' dataframe needs the following variables to function:
#' \describe{
#' \item{variable}{name of new (mutated) variable that is recoded}
#' \item{toType}{type the variable is being recoded to
#' \emph{cat = categorical, cont = continuous}}
#' \item{databaseStart}{name of dataframe with original variables to be
#' recoded}
#' \item{variableStart}{name of variable to be recoded}
#' \item{fromType}{variable type of start variable.
#' \emph{cat = categorical or factor variable}
#' \emph{cont = continuous variable (real number or integer)}}
#' \item{recTo}{Value to recode to}
#' \item{recFrom}{Value/range being recoded from}
#' }
#' Each row in \emph{variable_details} comprises one category in a
#' newly transformed variable. The rules for each category the new variable
#' are a string in \emph{recFrom} and value in \emph{recTo}.
#' These recode pairs are the same syntax as \emph{sjmisc::rec()},
#' except in \emph{sjmisc::rec()} the pairs are a string for the function
#' attribute \emph{rec =}, separated by '\emph{=}'.
#' For example in \emph{rec_w_table}
#' \emph{variable_details$recFrom = 2; variable_details$recTo = 4}
#' is the same as \emph{sjmisc::rec(rec = "2=4")}.
#' the pairs are obtained from the RecFrom and RecTo columns
#' \describe{
#' \item{recode pairs}{each recode pair is row. see above example
#' or \emph{PBC-variableDetails.csv}}
#' \item{multiple values}{multiple old values that should be recoded into
#' a new single value may be separated with comma, e.g.
#' \emph{recFrom = "1,2"; recTo = 1}}
#' \item{value range}{a value range is indicated by a colon, e.g.
#' \emph{recFrom= "1:4"; recTo = 1} (recodes all values from 1 to 4 into 1)}
#' \item{value range for doubles}{for double vectors (with fractional part),
#' all values within the specified range are recoded; e.g.
#' \emph{recFrom = "1:2.5'; recTo = 1} recodes 1 to 2.5 into 1,
#' but 2.55 would not be recoded
#' (since it's not included in the specified range)}
#' \item{\emph{"min"} and \emph{"max"}}{minimum and maximum values
#' are indicates by \emph{min} (or \emph{lo}) and \emph{max} (or \emph{hi}),
#' e.g. \emph{recFrom = "min:4"; recTo = 1} (recodes all values from
#' minimum values of \emph{x} to 4 into 1)}
#' \item{\emph{"else"}}{all other values, which have not been specified yet,
#' are indicated by \emph{else}, e.g. \emph{recFrom = "else"; recTo = NA}
#' (recode all other values (not specified in other rows) to "NA")}
#' \item{\emph{"copy"}}{the \emph{"else"}-token can be combined with
#' \emph{copy}, indicating that all remaining, not yet recoded values should
#' stay the same (are copied from the original value), e.g.
#' \emph{recFrom = "else"; recTo = "copy"}}
#' \item{\emph{NA}'s}{\emph{NA} values are allowed both as old and
#' new value, e.g.
#' \emph{recFrom "NA"; recTo = 1. or "recFrom = "3:5"; recTo = "NA"}
#' (recodes all NA into 1,
#' and all values from 3 to 5 into NA in the new variable)}
#' }
#'
#' @param data A dataframe containing the variables to be recoded. Can also be a list of dataframes
#' @param variables character vector containing variable names to recode or
#' a variables csv containing additional variable info
#' @param database_name String, the name of the dataset containing the variables
#' to be recoded. Can also be a vector of strings if data is a list
#' @param variable_details A dataframe containing the specifications (rules)
#' for recoding.
#' @param else_value Value (string, number, integer, logical or NA) that is used
#' to replace any values that are outside the specified ranges
#' (no rules for recoding).
#' @param append_to_data Logical, if \code{TRUE} (default), recoded variables
#' will be appended to the data.
#' @param log Logical, if \code{FALSE} (default), a log of recoding will
#' not be printed.
#' @param notes Logical, if \code{FALSE} (default), will not print the
#' content inside the `Note`` column of the variable being recoded.
#' @param var_labels labels vector to attach to variables in variables
#' @param custom_function_path path to location of the function to load
#' @param attach_data_name to attach name of database to end table
#'
#' @return a dataframe that is recoded according to rules in variable_details.
#'
#' @examples
#' library(cchsflow)
#' bmi2001 <- rec_with_table(
#' data = cchs2001_p, c(
#' "HWTGHTM",
#' "HWTGWTK", "HWTGBMI_der"
#' )
#' )
#'
#' head(bmi2001)
#'
#' bmi2011_2012 <- rec_with_table(
#' data = cchs2011_2012_p, c(
#' "HWTGHTM",
#' "HWTGWTK", "HWTGBMI_der"
#' )
#' )
#'
#' tail(bmi2011_2012)
#'
#' combined_bmi <- bind_rows(bmi2001, bmi2011_2012)
#' head(combined_bmi)
#' tail(combined_bmi)
#' @importFrom haven tagged_na
#' @importFrom stringr str_match
#' @importFrom dplyr rowwise select do
#' @importFrom magrittr %>%
#' @export
rec_with_table <-
function(data,
variables = NULL,
database_name = NULL,
variable_details = NULL,
else_value = NA,
append_to_data = FALSE,
log = FALSE,
notes = TRUE,
var_labels = NULL,
custom_function_path = NULL,
attach_data_name = FALSE) {
# If custom Functions are passed create new environment and source
if (!is.null(custom_function_path)) {
source(custom_function_path)
}
if (is.null(variable_details)) {
message("No variable_details detected.
Loading cchsflow variable_details")
data(variable_details, package = "cchsflow", envir = environment())
}
if (is.null(variables)) {
message("No variables detected.
Loading cchsflow variables")
data(variables, package = "cchsflow", envir = environment())
}
if (is.null(database_name)) {
message("Using the passed data variable name as database_name")
database_name <- deparse(substitute(data))
}
# ---- Step 1: Detemine if the passed data is a list or single database
append_non_db_columns <- FALSE
if (is.list(data) &&
length(database_name) == length(data)) {
for (data_name in database_name) {
# ---- Step 2A: Verify that the passed name exists in the passed data
if (!is.null(data[[data_name]])) {
data[[data_name]] <- recode_call(
variables = variables,
data = data[[data_name]],
database_name = database_name,
print_note = notes,
else_value = else_value,
variable_details = variable_details,
append_to_data = append_to_data,
append_non_db_columns = append_non_db_columns,
log = log,
var_labels = var_labels
)
} else {
stop(
paste(
"The data",
data_name,
"is missing from the passed list please verify the names are
correct in the data list and the database_name list"
)
)
}
}
} else if ("data.frame" %in% class(data) &&
length(database_name) == 1) {
data <- recode_call(
variables = variables,
data = data,
database_name = database_name,
print_note = notes,
else_value = else_value,
variable_details = variable_details,
append_to_data = append_to_data,
append_non_db_columns = append_non_db_columns,
log = log,
var_labels = var_labels
)
if (attach_data_name) {
data[["data_name"]] <- database_name
}
} else {
stop(
paste(
"The passed number of data does not match the passed number of
data_names. Please verify that the number of databases matches the number
of passed names.
Aborting operation!"
),
call. = FALSE
)
}
return(data)
}
# Creates inputs and runs recode functions
recode_call <-
function(variables,
data,
database_name,
print_note,
else_value,
variable_details,
append_to_data,
append_non_db_columns,
log,
var_labels) {
variable_details[[pkg.globals$argument.Variables]] <-
trimws(variable_details[[pkg.globals$argument.Variables]])
if (!is.null(variables) && "data.frame" %in% class(variables)) {
variables[[pkg.globals$argument.Variables]] <-
trimws(variables[[pkg.globals$argument.Variables]])
variable_details <-
update_variable_details_based_on_variable_sheet(
variable_sheet = variables,
variable_details = variable_details
)
} else {
if (!is.null(variables)) {
variable_details <-
variable_details[trimws(variable_details[[
pkg.globals$argument.Variables]]) %in% variables, ]
vars_being_recoded <-
as.character(unique(variable_details[[
pkg.globals$argument.Variables]]))
if (length(vars_being_recoded) != length(variables)) {
missing_vars <- setdiff(variables, vars_being_recoded)
warning(
paste(
missing_vars,
"is missing from variable details therefore cannot be recoded"
)
)
}
}
if (is.null(variable_details[[pkg.globals$argument.VariableLabel]])) {
variable_details[[pkg.globals$argument.VariableLabel]] <- NA
}
if (is.null(variable_details[[
pkg.globals$argument.VariableLabelShort]])) {
variable_details[[pkg.globals$argument.VariableLabelShort]] <- NA
}
}
if (!is.null(var_labels)) {
if (is.null(names(var_labels))) {
stop(
"The passed labels was not a named vector please follow
the c(var_name = varLalbel) format"
)
} else {
if (is.factor(variable_details[[
pkg.globals$argument.VariableLabelShort]])) {
variable_details[[pkg.globals$argument.VariableLabelShort]] <-
as.character(variable_details[[
pkg.globals$argument.VariableLabelShort]])
}
for (var_name in names(var_labels)) {
variable_details[variable_details[[
pkg.globals$argument.Variables]] == var_name,
pkg.globals$argument.VariableLabelShort] <-
var_labels[[var_name]]
}
}
}
all_possible_var_names <-
unique(as.character(variable_details[[pkg.globals$argument.Variables]]))
all_variables_detected <-
variable_details[grepl(database_name, variable_details[[
pkg.globals$argument.DatabaseStart]]), ]
rec_data <-
recode_columns(
data = data,
variables_to_process = all_variables_detected,
data_name = database_name,
log = log,
print_note = print_note,
else_default = else_value
)
if (append_non_db_columns) {
missed_variables <-
all_possible_var_names[!all_possible_var_names %in%
unique(as.character(
all_variables_detected[
,
pkg.globals$argument.Variables]))]
for (missed_variable_name in missed_variables) {
rec_data[[missed_variable_name]] <- NA
}
}
if (append_to_data) {
data <- cbind(data, rec_data)
} else {
data <- rec_data
}
return(data)
}
#' @title Get Data Variable Name
#'
#' @name get_data_variable_name
#'
#' @description Retrieves the name of the column inside data to
#' use for calculations
#'
#' @param data_name name of the database being checked
#' @param data database being checked
#' @param row_being_checked the row from variable details that contains
#' information on this variable
#' @param variable_being_checked the name of the recoded variable
#'
#' @return the data equivalent of variable_being_checked
get_data_variable_name <-
function(data_name,
data,
row_being_checked,
variable_being_checked) {
data_variable_being_checked <- character()
var_start_names <-
as.character(row_being_checked[[pkg.globals$argument.VariableStart]])
if (grepl(data_name, var_start_names)) {
var_start_names_list <- as.list(strsplit(var_start_names, ",")[[1]])
# Find exact var Name
for (var_name in var_start_names_list) {
if (grepl(data_name, var_name)) {
# seperate dataname from the var name
data_variable_being_checked <-
as.list(strsplit(var_name, "::")[[1]])[[2]]
}
}
# Check for default variable name
} else if (grepl("\\[", var_start_names)) {
# Strip default var name tags: []
data_variable_being_checked <-
str_match(var_start_names, "\\[(.*?)\\]")[, 2]
} else {
stop(
paste(
"The row
",
row,
"for the variable",
variable_being_checked,
"
Does not contain the database being checked(",
data_name,
") in its variable start the default is also missing.
Please double check if this variable should have this",
data_name,
"included in its databaseStart"
)
)
}
data_variable_being_checked <- trimws(data_variable_being_checked)
return(data_variable_being_checked)
}
#' recode_columns
#'
#' Recodes columns from passed row and returns just table with those columns
#' and same rows as the data
#'
#' @param data The source database
#' @param variables_to_process rows from variable details that are applicable
#' to this DB
#' @param data_name Name of the database being passed
#' @param log The option of printing log
#' @param print_note the option of printing the note columns
#' @param else_default default else value to use if no else is present
#'
#' @return Returns recoded and labeled data
recode_columns <-
function(data,
variables_to_process,
data_name,
log,
print_note,
else_default) {
# Split variables to process into recode map and func
map_variables_to_process <-
variables_to_process[grepl("map::", variables_to_process[[
pkg.globals$argument.CatValue]]), ]
func_variables_to_process <-
variables_to_process[grepl("DerivedVar::", variables_to_process[[
pkg.globals$argument.VariableStart]]), ]
rec_variables_to_process <-
variables_to_process[(!grepl("Func::|map::", variables_to_process[[
pkg.globals$argument.CatValue]])) & (!grepl("DerivedVar::",
variables_to_process[[
pkg.globals$argument.VariableStart]])), ]
label_list <- list()
# Set interval if none is present
valid_intervals <- c("[,]", "[,)", "(,]")
interval_default <- "[,]"
recoded_data <- data[, 0]
# Loop through the rows of recode vars
while (nrow(rec_variables_to_process) > 0) {
variable_being_checked <-
as.character(rec_variables_to_process[1,
pkg.globals$argument.Variables])
rows_being_checked <-
rec_variables_to_process[rec_variables_to_process[[
pkg.globals$argument.Variables]] == variable_being_checked, ]
rec_variables_to_process <-
rec_variables_to_process[!rec_variables_to_process[[
pkg.globals$argument.Variables]] == variable_being_checked, ]
first_row <- rows_being_checked[1, ]
# Check for varialbe existance in data
data_variable_being_checked <-
get_data_variable_name(
data_name = data_name,
row_being_checked = first_row,
variable_being_checked = variable_being_checked,
data = data
)
if (is.null(data[[data_variable_being_checked]])) {
warning(
paste(
"Data",
data_name,
"does not contain the variable",
data_variable_being_checked
)
)
} else {
# Check for From column duplicates
all_from_values_for_variable <-
rows_being_checked[[pkg.globals$argument.From]]
if (length(unique(
all_from_values_for_variable)) != length(
all_from_values_for_variable)) {
for (single_from in all_from_values_for_variable) {
# Check if value is repeated more then once
if (sum(all_from_values_for_variable == single_from) > 1) {
stop(
paste(
single_from,
"was detected more then once in",
variable_being_checked,
"please make sure only one from value is being recoded"
)
)
}
}
}
# Set factor for all recode values
label_list[[variable_being_checked]] <-
create_label_list_element(rows_being_checked)
else_value <-
as.character(rows_being_checked[rows_being_checked[[
pkg.globals$argument.From]] == "else",
pkg.globals$argument.CatValue])
if (length(else_value) == 1 &&
!is_equal(else_value, "character(0)")) {
else_value <-
recode_variable_NA_formating(else_value, label_list[[
variable_being_checked]]$type)
if (is_equal(else_value, "copy")) {
recoded_data[variable_being_checked] <-
data[data_variable_being_checked]
} else {
recoded_data[variable_being_checked] <- else_value
}
# Catch multiple else rows
} else if (length(else_value) > 1) {
stop(
paste(
variable_being_checked,
" contains",
length(else_value),
"rows of else only one else value is allowed"
)
)
}
else {
recoded_data[variable_being_checked] <- else_default
}
rows_being_checked <-
rows_being_checked[!rows_being_checked[[
pkg.globals$argument.From]] == "else", ]
if (nrow(rows_being_checked) > 0) {
log_table <- rows_being_checked[, 0]
log_table$value_to <- NA
log_table$From <- NA
log_table$rows_recoded <- NA
levels(recoded_data[[variable_being_checked]]) <-
c(levels(recoded_data[[variable_being_checked]]),
levels(rows_being_checked[[pkg.globals$argument.CatValue]]))
for (row in seq_len(nrow(rows_being_checked))) {
row_being_checked <- rows_being_checked[row, ]
# If cat go check for label and obtain it
# regardless obtain unit and attach
# find var name for this database
data_variable_being_checked <-
get_data_variable_name(
data_name = data_name,
row_being_checked = row_being_checked,
variable_being_checked = variable_being_checked,
data = data
)
# Recode the variable
from_values <- list()
interval <- ""
if (grepl("\\[*\\]", as.character(row_being_checked[[
pkg.globals$argument.From]])) ||
grepl("\\(*\\)", as.character(row_being_checked[[
pkg.globals$argument.From]]))) {
from_values <-
strsplit(as.character(row_being_checked[[
pkg.globals$argument.From]]), ",")[[1]]
from_values[[1]] <- trimws(from_values[[1]])
from_values[[2]] <- trimws(from_values[[2]])
interval_left <- substr(from_values[[1]], 1, 1)
interval_right <- substr(from_values[[2]], nchar(from_values[[2]]), nchar(from_values[[2]]))
interval <- paste0(interval_left,",",interval_right)
if(grepl("\\[", from_values[[1]])){
from_values[[1]] <- gsub("\\[", "", from_values[[1]])
}
else{
from_values[[1]] <- gsub("\\(", "", from_values[[1]])
}
if(grepl("\\]", from_values[[2]])){
from_values[[2]] <- gsub("|]", "", from_values[[2]])
}
else{
from_values[[2]] <- gsub("|)", "", from_values[[2]])
}
}
else {
temp_from <-
as.character(row_being_checked[[pkg.globals$argument.From]])
from_values[[1]] <- temp_from
from_values[[2]] <- from_values[[1]]
}
value_recorded <-
as.character(row_being_checked[[pkg.globals$argument.CatValue]])
if (from_values[[1]] == from_values[[2]]) {
interval <- "[,]"
}else if (!interval %in% valid_intervals) {
message(paste("For variable", variable_being_checked, "invalid interval was passed.\nDefault interval will be used:", interval_default))
interval <- interval_default
}
valid_row_index <- compare_value_based_on_interval(
compare_columns = data_variable_being_checked,
data = data,
left_boundary = from_values[[1]],
right_boundary = from_values[[2]],
interval = interval
)
# Start construction of dataframe for log
log_table[row, "value_to"] <- value_recorded
log_table[row, "From"] <-
as.character(row_being_checked[[pkg.globals$argument.From]])
log_table[row, "rows_recoded"] <-
sum(valid_row_index, na.rm = TRUE)
value_recorded <-
recode_variable_NA_formating(value_recorded, label_list[[
variable_being_checked]]$type)
if (is_equal(value_recorded, "copy")) {
value_recorded <-
data[valid_row_index, data_variable_being_checked]
}
recoded_data[valid_row_index, variable_being_checked] <-
value_recorded
if (print_note &&
!is.null(row_being_checked[[pkg.globals$argument.Notes]]) &&
!is_equal(row_being_checked[[pkg.globals$argument.Notes]],
"") &&
!is.na(row_being_checked[[pkg.globals$argument.Notes]])) {
message("NOTE for ", variable_being_checked,
": ",
as.character(row_being_checked[[
pkg.globals$argument.Notes]]))
}
}
# if log was requested print it
if (log) {
message(
"The variable ",
data_variable_being_checked,
" was recoded into ",
variable_being_checked,
" for the database ",
data_name,
" the following recodes were made: "
)
# Reset rowCount to avoid confusion
rownames(log_table) <- NULL
print(log_table)
}
}
}
}
# Process funcVars
while (nrow(func_variables_to_process) > 0) {
first_row <- func_variables_to_process[1, ]
first_row_variable_name <-
as.character(first_row[[pkg.globals$argument.Variables]])
# get name of var pass to
derived_return <-
recode_derived_variables(
variable_being_processed = first_row_variable_name,
recoded_data = recoded_data,
variables_to_process = func_variables_to_process,
log = log,
print_note = print_note,
else_default = else_default,
label_list = label_list,
var_stack = c()
)
label_list <- derived_return$label_list
recoded_data <- derived_return$recoded_data
func_variables_to_process <-
derived_return$variables_to_process
}
# Populate data Labels
recoded_data <-
label_data(label_list = label_list, data_to_label = recoded_data)
return(recoded_data)
}
#' Compare Value Based On Interval
#'
#' Compare values on the scientific notation interval
#'
#' @param left_boundary the min value
#' @param right_boundary the max value
#' @param data the data that contains values being compared
#' @param compare_columns The columns inside data being checked
#' @param interval The scientific notation interval
#'
#' @return a boolean vector containing true for rows where the
#' comparison is true
compare_value_based_on_interval <-
function(left_boundary,
right_boundary,
data,
compare_columns,
interval) {
return_boolean <- vector()
if (suppressWarnings(is.na(as.numeric(left_boundary)))) {
return_boolean <-
data[[compare_columns]] %in% data[[
compare_columns]][
which(left_boundary == data[[compare_columns]])]
} else {
if (interval == "[,]") {
return_boolean <-
data[[compare_columns]] %in% data[[
compare_columns]][which(
as.numeric(left_boundary) <= data[[compare_columns]] &
data[[compare_columns]] <= as.numeric(right_boundary)
)]
} else if (interval == "[,)") {
return_boolean <-
data[[compare_columns]] %in% data[[
compare_columns]][which(
as.numeric(left_boundary) <= data[[compare_columns]] &
data[[compare_columns]] < as.numeric(right_boundary)
)]
} else if (interval == "(,]") {
return_boolean <-
data[[compare_columns]] %in% data[[
compare_columns]][which(
as.numeric(left_boundary) < data[[compare_columns]] &
data[[compare_columns]] <= as.numeric(right_boundary)
)]
}
else if (interval == "(,)") {
return_boolean <-
data[[compare_columns]] %in% data[[
compare_columns]][which(
as.numeric(left_boundary) < data[[compare_columns]] &
data[[compare_columns]] < as.numeric(right_boundary)
)]
}
else {
stop("Invalid Argument was passed")
}
}
return(return_boolean)
}
# Parse out variables csv
update_variable_details_based_on_variable_sheet <-
function(variable_sheet, variable_details) {
# remove conflicting columns from variable details
variable_details <-
variable_details[, !(
names(variable_details) %in% c(
pkg.globals$MSW.Variables.Columns.VariableType,
pkg.globals$MSW.Variables.Columns.Label,
pkg.globals$MSW.Variables.Columns.LabelLong,
pkg.globals$MSW.Variables.Columns.Units
)
)]
# Only keep the needed columns
variable_sheet <-
variable_sheet[, c(
pkg.globals$MSW.Variables.Columns.Variable,
pkg.globals$MSW.Variables.Columns.VariableType,
pkg.globals$MSW.Variables.Columns.Label,
pkg.globals$MSW.Variables.Columns.LabelLong,
pkg.globals$MSW.Variables.Columns.Units
)]
# merge the labels and data
variable_details <-
merge(
variable_details,
variable_sheet,
by.x = pkg.globals$argument.Variables,
by.y = pkg.globals$MSW.Variables.Columns.Variable,
all.x = TRUE
)
# remove variables not present in variable_sheet
variable_details <-
variable_details[variable_details[[pkg.globals$argument.Variables]] %in%
variable_sheet[[
pkg.globals$MSW.Variables.Columns.Variable]], ]
return(variable_details)
}
#' Recode NA formatting
#'
#' Recodes the NA depending on the var type
#'
#' @param cell_value The value inside the recTo column
#' @param var_type the toType of a variable
#'
#' @return an appropriately coded tagged NA
recode_variable_NA_formating <- function(cell_value, var_type) {
recode_value <- NULL
if (grepl("NA", cell_value)) {
na_value_list <- strsplit(cell_value, ":")[[1]]
if (is_equal(var_type, pkg.globals$argument.CatType)) {
recode_value <- paste("NA(", na_value_list[[3]], ")", sep = "")
} else {
recode_value <- tagged_na(as.character(na_value_list[[3]]))
}
} else {
if (!is_equal(var_type, pkg.globals$argument.CatType) &&
!is_equal(cell_value, "copy")) {
cell_value <- as.numeric(cell_value)
}
recode_value <- cell_value
}
return(recode_value)
}
recode_derived_variables <-
function(recoded_data,
variable_being_processed,
variables_to_process,
log,
print_note,
else_default,
label_list,
var_stack) {
if (nrow(variables_to_process) <= 0) {
stop(paste(
variable_being_processed,
"is missing from variable_details"
))
}
var_stack <- c(var_stack, variable_being_processed)
# obtain rows to process and updated variables to Process
variable_rows <-
variables_to_process[variables_to_process[[
pkg.globals$argument.Variables]] == variable_being_processed, ]
fun_variable_rows <-
variable_rows[grepl("Func::", variable_rows[[
pkg.globals$argument.CatValue]]), ]
variables_to_process <-
variables_to_process[variables_to_process[[
pkg.globals$argument.Variables]] != variable_being_processed, ]
for (row_num in seq_len(nrow(fun_variable_rows))) {
# Check for presence of feeder variables in data and in the
# variable being processed stack
feeder_vars <-
as.list(strsplit(as.character(variable_rows[row_num, ][[
pkg.globals$argument.VariableStart]]), "::"))[[1]][[2]]
feeder_vars <- gsub("\\[|\\]", "", feeder_vars)
feeder_vars <- as.list(strsplit(feeder_vars, ","))[[1]]
feeder_vars <- sapply(feeder_vars, trimws)
used_feeder_vars <- feeder_vars
feeder_vars <- setdiff(feeder_vars, names(recoded_data))
# Check if the variable has a function to recode
non_func_missing_variables <-
setdiff(feeder_vars, unique(as.character(variables_to_process[[
pkg.globals$argument.Variables]])))
if (length(non_func_missing_variables) > 0) {
warning(
paste(
variable_being_processed,
"could not be derived because",
feeder_vars,
"was never specified nor is it a function variable,
therefore it was not recoded \n"
)
)
var_stack <-
var_stack[!(var_stack == variable_being_processed)]
return(
list(
var_stack = var_stack,
label_list = label_list,
recoded_data = recoded_data,
variables_to_process = variables_to_process
)
)
}
# Check for presense in var_stack
if (length(intersect(feeder_vars, var_stack)) > 0) {
conflict_vars <- intersect(feeder_vars, var_stack)
stop(
paste(
conflict_vars,
"is required to create",
variable_being_processed,
"but",
variable_being_processed,
"is needed to create",
conflict_vars
)
)
}
# Update var_stack and recurse to get the feeder vars
for (one_feeder in feeder_vars) {
# Need to check recoded data again in case a recursion added it
if (!one_feeder %in% names(recoded_data)) {
derived_return <-
recode_derived_variables(
variable_being_processed = one_feeder,
recoded_data = recoded_data,
variables_to_process = variables_to_process,
log = log,
print_note = print_note,
else_default = else_default,
label_list = label_list,
var_stack = var_stack
)
var_stack <- derived_return$var_stack
label_list <- derived_return$label_list
recoded_data <- derived_return$recoded_data
variables_to_process <-
derived_return$variables_to_process
}
}
# Obtain the function for each row
row_being_checked <- variable_rows[row_num, ]
func_cell <-
as.character(row_being_checked[[pkg.globals$argument.CatValue]])
function_being_used <-
as.list(strsplit(func_cell, "::"))[[1]][[2]]
column_value <-
recoded_data %>%
rowwise() %>%
select(used_feeder_vars) %>%
do(
column_being_added = calculate_custom_function_row_value(
.,
variable_names = used_feeder_vars,
custom_function_name = function_being_used
)
)
# Set type of var
if (as.character(row_being_checked[[pkg.globals$argument.ToType]]) !=
pkg.globals$argument.CatType) {
column_value <- as.numeric(unlist(column_value[["column_being_added"]]))
}else{
column_value <- as.factor(unlist(column_value[["column_being_added"]]))
}
recoded_data[[variable_being_processed]] <-
column_value
var_stack <-
var_stack[!(var_stack == variable_being_processed)]
}
label_list[[as.character(variable_being_processed)]] <-
assign(variable_being_processed, create_label_list_element(variable_rows))
return(
list(
var_stack = var_stack,
label_list = label_list,
recoded_data = recoded_data,
variables_to_process = variables_to_process
)
)
}
calculate_custom_function_row_value <-
function(row_values,
variable_names,
custom_function_name) {
row_values <- unname(row_values)
custom_function_return_value <-
do.call(get(custom_function_name), row_values)
return(custom_function_return_value)
}