69 match name handles groups (RMI-PACTA#77)

* prioritize no longer informs ignoring groups * match_name preserves groups Closes RMI-PACTA#69 Also extract roxygen template to document such behaviour. * Document prefer_perfect_match_by Closes RMI-PACTA#61 I finally decided to document this in @return instead of in a new argument. * Document
2DegreesInvesting · Jan 6, 2020 · eb45de1 · eb45de1
1 parent e7fd027
commit eb45de1
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 15 deletions.
diff --git a/R/match_name.R b/R/match_name.R
@@ -8,6 +8,7 @@
 #' using [stringdist::stringsim()].
 #'
 #' @template alias-assign
+#' @template ignores-but-preserves-existing-groups
 #'
 #' @inherit score_alias_similarity
 #' @inheritParams restructure_loanbook_for_matching
@@ -16,9 +17,14 @@
 #'
 #' @family user-oriented
 #'
-#' @return A dataframe with the same columns as the loanbook data with
-#'   additional columns: `id`, `sector`, `sector_ald`, `source`,
-#'   `alias`, `alias_ald`, `score`, `name_ald`.
+#' @return A dataframe with the same groups (if any) and columns as `loanbook`,
+#'   and the additional columns: `id`, `sector`, `sector_ald`, `source`,
+#'   `alias`, `alias_ald`, `score`, `name_ald`. The returned rows depend on the
+#'   argument `min_score` and the result of the column `score` for each loan:
+#'   * If any row has `score` equal to 1, `match_name()` returns all rows where
+#'   `score` equals 1, dropping all other rows.
+#'   * If no row has `score` equal to 1, `match_name()` returns all rows where
+#'   `score` is equal to or greater than `min_score`.
 #'
 #' @export
 #'
@@ -40,6 +46,9 @@ match_name <- function(loanbook,
                        method = "jw",
                        p = 0.1,
                        overwrite = NULL) {
+  old_groups <- dplyr::groups(loanbook)
+  loanbook <- ungroup(loanbook)
+
   prep_lbk <- suppressMessages(
     restructure_loanbook_for_matching(loanbook, overwrite = overwrite)
   )
@@ -61,7 +70,7 @@ match_name <- function(loanbook,
   level_cols <- out %>%
     names_matching(level = get_level_columns())
 
-  out %>%
+  out <- out %>%
     tidyr::pivot_longer(
       cols = level_cols,
       names_to = "level_lbk",
@@ -72,6 +81,8 @@ match_name <- function(loanbook,
       level_lbk = sub("_lbk$", "", .data$level_lbk),
     ) %>%
     remove_suffix("_lbk")
+
+  dplyr::group_by(out, !!! old_groups)
 }
 
 suffix_names <- function(data, suffix, names = NULL) {

diff --git a/R/prioritize.R b/R/prioritize.R
@@ -1,5 +1,7 @@
 #' Pick rows where `score` is 1 and `level` per loan is of highest `priority`
 #'
+#' @template ignores-but-preserves-existing-groups
+#'
 #' @param data A  dataframe, commonly the output of [match_name()].
 #' @param priority One of:
 #'   * `NULL`: defaults to the default level priority as returned by
@@ -53,10 +55,6 @@ prioritize <- function(data, priority = NULL) {
   priority <- set_priority(data, priority = priority)
 
   old_groups <- dplyr::groups(data)
-  if (!is.null(old_groups)) {
-    message("Ignoring preexisting groups.")
-  }
-
   perfect_matches <- filter(ungroup(data), .data$score == 1L)
 
   out <- perfect_matches %>%

diff --git a/man-roxygen/ignores-but-preserves-existing-groups.R b/man-roxygen/ignores-but-preserves-existing-groups.R
@@ -0,0 +1,2 @@
+#' @section Handling grouped data:
+#' This function ignores but preserves existing groups.
diff --git a/man/match_name.Rd b/man/match_name.Rd
diff --git a/man/prioritize.Rd b/man/prioritize.Rd
diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R
@@ -164,3 +164,10 @@ test_that("match_name()$level lacks prefixf 'name_' suffix '_lbk'", {
     any(endsWith(unique(out$level), "_lbk"))
   )
 })
+
+test_that("match_name preserves groups", {
+  grouped_loanbook <- slice(loanbook_demo, 4:5) %>%
+    group_by(id_loan)
+
+  expect_true(is_grouped_df(match_name(grouped_loanbook, ald_demo)))
+})
diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R
@@ -145,10 +145,7 @@ test_that("prioritize previous preserves groups", {
     group_by(other_id, score)
   # styler: on
 
-  expect_message(
-    out <- prioritize(matched, priority = "z"),
-    "[Ii]gnor.*group"
-  )
+  out <- prioritize(matched, priority = "z")
   expect_true(dplyr::is_grouped_df(out))
   expect_equal(dplyr::group_vars(out), c("other_id", "score"))
 })