From eb45de1ca522b5c1640391cbaf84694c2934a65f Mon Sep 17 00:00:00 2001 From: Mauro Lepore Date: Mon, 6 Jan 2020 15:47:26 -0600 Subject: [PATCH] 69 match name handles groups (#77) * prioritize no longer informs ignoring groups * match_name preserves groups Closes #69 Also extract roxygen template to document such behaviour. * Document prefer_perfect_match_by Closes #61 I finally decided to document this in @return instead of in a new argument. * Document --- R/match_name.R | 19 +++++++++++++++---- R/prioritize.R | 6 ++---- .../ignores-but-preserves-existing-groups.R | 2 ++ man/match_name.Rd | 18 +++++++++++++++--- man/prioritize.Rd | 5 +++++ tests/testthat/test-match_name.R | 7 +++++++ tests/testthat/test-prioritize.R | 5 +---- 7 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 man-roxygen/ignores-but-preserves-existing-groups.R diff --git a/R/match_name.R b/R/match_name.R index af850e96..74ee0636 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -8,6 +8,7 @@ #' using [stringdist::stringsim()]. #' #' @template alias-assign +#' @template ignores-but-preserves-existing-groups #' #' @inherit score_alias_similarity #' @inheritParams restructure_loanbook_for_matching @@ -16,9 +17,14 @@ #' #' @family user-oriented #' -#' @return A dataframe with the same columns as the loanbook data with -#' additional columns: `id`, `sector`, `sector_ald`, `source`, -#' `alias`, `alias_ald`, `score`, `name_ald`. +#' @return A dataframe with the same groups (if any) and columns as `loanbook`, +#' and the additional columns: `id`, `sector`, `sector_ald`, `source`, +#' `alias`, `alias_ald`, `score`, `name_ald`. The returned rows depend on the +#' argument `min_score` and the result of the column `score` for each loan: +#' * If any row has `score` equal to 1, `match_name()` returns all rows where +#' `score` equals 1, dropping all other rows. +#' * If no row has `score` equal to 1, `match_name()` returns all rows where +#' `score` is equal to or greater than `min_score`. 
#' #' @export #' @@ -40,6 +46,9 @@ match_name <- function(loanbook, method = "jw", p = 0.1, overwrite = NULL) { + old_groups <- dplyr::groups(loanbook) + loanbook <- ungroup(loanbook) + prep_lbk <- suppressMessages( restructure_loanbook_for_matching(loanbook, overwrite = overwrite) ) @@ -61,7 +70,7 @@ match_name <- function(loanbook, level_cols <- out %>% names_matching(level = get_level_columns()) - out %>% + out <- out %>% tidyr::pivot_longer( cols = level_cols, names_to = "level_lbk", @@ -72,6 +81,8 @@ match_name <- function(loanbook, level_lbk = sub("_lbk$", "", .data$level_lbk), ) %>% remove_suffix("_lbk") + + dplyr::group_by(out, !!! old_groups) } suffix_names <- function(data, suffix, names = NULL) { diff --git a/R/prioritize.R b/R/prioritize.R index 8d571fd8..223c57b5 100644 --- a/R/prioritize.R +++ b/R/prioritize.R @@ -1,5 +1,7 @@ #' Pick rows where `score` is 1 and `level` per loan is of highest `priority` #' +#' @template ignores-but-preserves-existing-groups +#' #' @param data A dataframe, commonly the output of [match_name()]. #' @param priority One of: #' * `NULL`: defaults to the default level priority as returned by @@ -53,10 +55,6 @@ prioritize <- function(data, priority = NULL) { priority <- set_priority(data, priority = priority) old_groups <- dplyr::groups(data) - if (!is.null(old_groups)) { - message("Ignoring preexisting groups.") - } - perfect_matches <- filter(ungroup(data), .data$score == 1L) out <- perfect_matches %>% diff --git a/man-roxygen/ignores-but-preserves-existing-groups.R b/man-roxygen/ignores-but-preserves-existing-groups.R new file mode 100644 index 00000000..d3648463 --- /dev/null +++ b/man-roxygen/ignores-but-preserves-existing-groups.R @@ -0,0 +1,2 @@ +#' @section Handling grouped data: +#' This function ignores but preserves existing groups. 
diff --git a/man/match_name.Rd b/man/match_name.Rd index dc7072c7..50514e53 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -38,9 +38,16 @@ columns of a particular direct loantaker or ultimate parent. To overwrite only \code{sector}, the value in the \code{name} column should be \code{NA}.} } \value{ -A dataframe with the same columns as the loanbook data with -additional columns: \code{id}, \code{sector}, \code{sector_ald}, \code{source}, -\code{alias}, \code{alias_ald}, \code{score}, \code{name_ald}. +A dataframe with the same groups (if any) and columns as \code{loanbook}, +and the additional columns: \code{id}, \code{sector}, \code{sector_ald}, \code{source}, +\code{alias}, \code{alias_ald}, \code{score}, \code{name_ald}. The returned rows depend on the +argument \code{min_score} and the result of the column \code{score} for each loan: +\itemize{ +\item If any row has \code{score} equal to 1, \code{match_name()} returns all rows where +\code{score} equals 1, dropping all other rows. +\item If no row has \code{score} equal to 1, \code{match_name()} returns all rows where +\code{score} is equal to or greater than \code{min_score}. +} } \description{ \code{match_name()} scores the match between names in a loanbook dataset (columns @@ -63,6 +70,11 @@ commonly used in name matching algorithms: } } +\section{Handling grouped data}{ + +This function ignores but preserves existing groups. +} + \examples{ library(dplyr) library(r2dii.dataraw) diff --git a/man/prioritize.Rd b/man/prioritize.Rd index 1af3c91c..2f5f3760 100644 --- a/man/prioritize.Rd +++ b/man/prioritize.Rd @@ -25,6 +25,11 @@ priority level is highest. \description{ Pick rows where \code{score} is 1 and \code{level} per loan is of highest \code{priority} } +\section{Handling grouped data}{ + +This function ignores but preserves existing groups. 
+} + \examples{ library(dplyr) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index ff3d781e..5886ec74 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -164,3 +164,10 @@ test_that("match_name()$level lacks prefixf 'name_' suffix '_lbk'", { any(endsWith(unique(out$level), "_lbk")) ) }) + +test_that("match_name preserves groups", { + grouped_loanbook <- slice(loanbook_demo, 4:5) %>% + group_by(id_loan) + + expect_true(is_grouped_df(match_name(grouped_loanbook, ald_demo))) +}) diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R index 76ab82fa..d8fb744d 100644 --- a/tests/testthat/test-prioritize.R +++ b/tests/testthat/test-prioritize.R @@ -145,10 +145,7 @@ test_that("prioritize previous preserves groups", { group_by(other_id, score) # styler: on - expect_message( - out <- prioritize(matched, priority = "z"), - "[Ii]gnor.*group" - ) + out <- prioritize(matched, priority = "z") expect_true(dplyr::is_grouped_df(out)) expect_equal(dplyr::group_vars(out), c("other_id", "score")) })