From eb45de1ca522b5c1640391cbaf84694c2934a65f Mon Sep 17 00:00:00 2001
From: Mauro Lepore <maurolepore@gmail.com>
Date: Mon, 6 Jan 2020 15:47:26 -0600
Subject: [PATCH] 69 match name handles groups (#77)

* prioritize no longer informs ignoring groups

* match_name preserves groups

Closes #69

Also extract roxygen template to document such behaviour.

* Document prefer_perfect_match_by

Closes #61

I finally decided to document this in @return instead of in a new
argument.

* Document
---
 R/match_name.R                                | 19 +++++++++++++++----
 R/prioritize.R                                |  6 ++----
 .../ignores-but-preserves-existing-groups.R   |  2 ++
 man/match_name.Rd                             | 18 +++++++++++++++---
 man/prioritize.Rd                             |  5 +++++
 tests/testthat/test-match_name.R              |  7 +++++++
 tests/testthat/test-prioritize.R              |  5 +----
 7 files changed, 47 insertions(+), 15 deletions(-)
 create mode 100644 man-roxygen/ignores-but-preserves-existing-groups.R

diff --git a/R/match_name.R b/R/match_name.R
index af850e96..74ee0636 100644
--- a/R/match_name.R
+++ b/R/match_name.R
@@ -8,6 +8,7 @@
 #' using [stringdist::stringsim()].
 #'
 #' @template alias-assign
+#' @template ignores-but-preserves-existing-groups
 #'
 #' @inherit score_alias_similarity
 #' @inheritParams restructure_loanbook_for_matching
@@ -16,9 +17,14 @@
 #'
 #' @family user-oriented
 #'
-#' @return A dataframe with the same columns as the loanbook data with
-#'   additional columns: `id`, `sector`, `sector_ald`, `source`,
-#'   `alias`, `alias_ald`, `score`, `name_ald`.
+#' @return A dataframe with the same groups (if any) and columns as `loanbook`,
+#'   and the additional columns: `id`, `sector`, `sector_ald`, `source`,
+#'   `alias`, `alias_ald`, `score`, `name_ald`. The returned rows depend on the
+#'   argument `min_score` and the result of the column `score` for each loan:
+#'   * If any row has `score` equal to 1, `match_name()` returns all rows where
+#'   `score` equals 1, dropping all other rows.
+#'   * If no row has `score` equal to 1, `match_name()` returns all rows where
+#'   `score` is equal to or greater than `min_score`.
 #'
 #' @export
 #'
@@ -40,6 +46,9 @@ match_name <- function(loanbook,
                        method = "jw",
                        p = 0.1,
                        overwrite = NULL) {
+  old_groups <- dplyr::groups(loanbook)
+  loanbook <- ungroup(loanbook)
+
   prep_lbk <- suppressMessages(
     restructure_loanbook_for_matching(loanbook, overwrite = overwrite)
   )
@@ -61,7 +70,7 @@ match_name <- function(loanbook,
   level_cols <- out %>%
     names_matching(level = get_level_columns())
 
-  out %>%
+  out <- out %>%
     tidyr::pivot_longer(
       cols = level_cols,
       names_to = "level_lbk",
@@ -72,6 +81,8 @@ match_name <- function(loanbook,
       level_lbk = sub("_lbk$", "", .data$level_lbk),
     ) %>%
     remove_suffix("_lbk")
+
+  dplyr::group_by(out, !!! old_groups)
 }
 
 suffix_names <- function(data, suffix, names = NULL) {
diff --git a/R/prioritize.R b/R/prioritize.R
index 8d571fd8..223c57b5 100644
--- a/R/prioritize.R
+++ b/R/prioritize.R
@@ -1,5 +1,7 @@
 #' Pick rows where `score` is 1 and `level` per loan is of highest `priority`
 #'
+#' @template ignores-but-preserves-existing-groups
+#'
 #' @param data A  dataframe, commonly the output of [match_name()].
 #' @param priority One of:
 #'   * `NULL`: defaults to the default level priority as returned by
@@ -53,10 +55,6 @@ prioritize <- function(data, priority = NULL) {
   priority <- set_priority(data, priority = priority)
 
   old_groups <- dplyr::groups(data)
-  if (!is.null(old_groups)) {
-    message("Ignoring preexisting groups.")
-  }
-
   perfect_matches <- filter(ungroup(data), .data$score == 1L)
 
   out <- perfect_matches %>%
diff --git a/man-roxygen/ignores-but-preserves-existing-groups.R b/man-roxygen/ignores-but-preserves-existing-groups.R
new file mode 100644
index 00000000..d3648463
--- /dev/null
+++ b/man-roxygen/ignores-but-preserves-existing-groups.R
@@ -0,0 +1,2 @@
+#' @section Handling grouped data:
+#' This function ignores but preserves existing groups.
diff --git a/man/match_name.Rd b/man/match_name.Rd
index dc7072c7..50514e53 100644
--- a/man/match_name.Rd
+++ b/man/match_name.Rd
@@ -38,9 +38,16 @@ columns of a particular direct loantaker or ultimate parent. To overwrite
 only \code{sector}, the value in the \code{name} column should be \code{NA}.}
 }
 \value{
-A dataframe with the same columns as the loanbook data with
-additional columns: \code{id}, \code{sector}, \code{sector_ald}, \code{source},
-\code{alias}, \code{alias_ald}, \code{score}, \code{name_ald}.
+A dataframe with the same groups (if any) and columns as \code{loanbook},
+and the additional columns: \code{id}, \code{sector}, \code{sector_ald}, \code{source},
+\code{alias}, \code{alias_ald}, \code{score}, \code{name_ald}. The returned rows depend on the
+argument \code{min_score} and the result of the column \code{score} for each loan:
+\itemize{
+\item If any row has \code{score} equal to 1, \code{match_name()} returns all rows where
+\code{score} equals 1, dropping all other rows.
+\item If no row has \code{score} equal to 1, \code{match_name()} returns all rows where
+\code{score} is equal to or greater than \code{min_score}.
+}
 }
 \description{
 \code{match_name()} scores the match between names in a loanbook dataset (columns
@@ -63,6 +70,11 @@ commonly used in name matching algorithms:
 }
 }
 
+\section{Handling grouped data}{
+
+This function ignores but preserves existing groups.
+}
+
 \examples{
 library(dplyr)
 library(r2dii.dataraw)
diff --git a/man/prioritize.Rd b/man/prioritize.Rd
index 1af3c91c..2f5f3760 100644
--- a/man/prioritize.Rd
+++ b/man/prioritize.Rd
@@ -25,6 +25,11 @@ priority level is highest.
 \description{
 Pick rows where \code{score} is 1 and \code{level} per loan is of highest \code{priority}
 }
+\section{Handling grouped data}{
+
+This function ignores but preserves existing groups.
+}
+
 \examples{
 library(dplyr)
 
diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R
index ff3d781e..5886ec74 100644
--- a/tests/testthat/test-match_name.R
+++ b/tests/testthat/test-match_name.R
@@ -164,3 +164,10 @@ test_that("match_name()$level lacks prefixf 'name_' suffix '_lbk'", {
     any(endsWith(unique(out$level), "_lbk"))
   )
 })
+
+test_that("match_name preserves groups", {
+  # Group by `id_loan`, then check that exact grouping survives match_name().
+  grouped_loanbook <- group_by(slice(loanbook_demo, 4:5), id_loan)
+  out <- match_name(grouped_loanbook, ald_demo)
+  expect_equal(dplyr::group_vars(out), "id_loan")
+})
diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R
index 76ab82fa..d8fb744d 100644
--- a/tests/testthat/test-prioritize.R
+++ b/tests/testthat/test-prioritize.R
@@ -145,10 +145,7 @@ test_that("prioritize previous preserves groups", {
     group_by(other_id, score)
   # styler: on
 
-  expect_message(
-    out <- prioritize(matched, priority = "z"),
-    "[Ii]gnor.*group"
-  )
+  out <- prioritize(matched, priority = "z")
   expect_true(dplyr::is_grouped_df(out))
   expect_equal(dplyr::group_vars(out), c("other_id", "score"))
 })