Skip to content
This repository has been archived by the owner on Sep 30, 2022. It is now read-only.

Commit

Permalink
Merge 9eb2ee4 into 4ecd34e
Browse files Browse the repository at this point in the history
  • Loading branch information
maurolepore committed Dec 30, 2019
2 parents 4ecd34e + 9eb2ee4 commit 9eaac1e
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 32 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: r2dii.match
Title: Match Loanbook with Asset Level Data
Version: 0.0.0.9001
Version: 0.0.0.9002
Authors@R:
c(person(given = "Mauro",
family = "Lepore",
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ export(string_similarity)
export(uniquify_id_column)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,if_else)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,rename)
importFrom(dplyr,select)
importFrom(dplyr,tibble)
importFrom(dplyr,tribble)
importFrom(dplyr,ungroup)
importFrom(magrittr,"%>%")
importFrom(purrr,reduce)
importFrom(r2dii.utils,check_crucial_names)
Expand Down
2 changes: 1 addition & 1 deletion R/imports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' @importFrom r2dii.utils check_crucial_names
#' @importFrom dplyr tibble tribble if_else rename distinct
#' @importFrom dplyr filter select mutate left_join
#' @importFrom dplyr filter select mutate left_join group_by ungroup
#' @importFrom purrr reduce
#' @importFrom rlang has_name set_names
NULL
Expand Down
19 changes: 18 additions & 1 deletion R/match_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ match_name <- function(loanbook,
matched %>%
pick_min_score(min_score) %>%
restore_cols_sector_name_and_others(prep_lbk, prep_ald) %>%
restore_cols_from_loanbook(loanbook)
restore_cols_from_loanbook(loanbook) %>%
prefer_perfect_match_by(.data$simpler_name_lbk)
}

suffix_names <- function(data, suffix, names = NULL) {
Expand Down Expand Up @@ -107,3 +108,19 @@ restore_cols_from_loanbook <- function(matched, loanbook) {
by = paste0(level_cols, "_lbk")
)
}

prefer_perfect_match_by <- function(data, ...) {
data %>%
group_by(...) %>%
filter(none_is_one(.data$score) | some_is_one(.data$score)) %>%
ungroup()
}

none_is_one <- function(x) {
all(x != 1L)
}

some_is_one <- function(x) {
any(x == 1L) & x == 1L
}

56 changes: 28 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ library(r2dii.match)
library(r2dii.dataraw)
#> Loading required package: r2dii.utils
library(tidyverse)
#> -- Attaching packages ----------------------------------- tidyverse 1.3.0 --
#> -- Attaching packages ------------------------------------------------------- tidyverse 1.3.0 --
#> <U+2713> ggplot2 3.2.1 <U+2713> purrr 0.3.3
#> <U+2713> tibble 2.1.3 <U+2713> dplyr 0.8.3
#> <U+2713> tidyr 1.0.0 <U+2713> stringr 1.4.0
#> <U+2713> readr 1.3.1 <U+2713> forcats 0.4.0
#> -- Conflicts -------------------------------------- tidyverse_conflicts() --
#> -- Conflicts ---------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
```
Expand Down Expand Up @@ -162,20 +162,20 @@ best-practices commonly used in name matching algorithms, such as:

``` r
match_name(your_loanbook, your_ald)
#> # A tibble: 833 x 27
#> # A tibble: 450 x 27
#> simpler_name_lbk simpler_name_ald score id_lbk sector_lbk source_lbk name_ald
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 astonmartin astonmartin 1 UP23 automotive loanbook aston m…
#> 2 avtozaz avtozaz 1 UP25 automotive loanbook avtozaz
#> 3 bogdan bogdan 1 UP36 automotive loanbook bogdan
#> 4 chauto chauto 1 UP52 automotive loanbook ch auto
#> 5 chauto chtcauto 0.867 UP52 automotive loanbook chtc au…
#> 6 chehejia chehejia 1 UP53 automotive loanbook chehejia
#> 7 chtcauto chauto 0.867 UP58 automotive loanbook ch auto
#> 8 chtcauto chtcauto 1 UP58 automotive loanbook chtc au
#> 9 dongfenghonda dongfenghonda 1 UP80 automotive loanbook dongfen
#> 10 dongfenghonda dongfengluxgen 0.867 UP80 automotive loanbook dongfen
#> # … with 823 more rows, and 20 more variables: sector_ald <chr>,
#> 1 astonmartin astonmartin 1 UP23 automotive loanbook aston m…
#> 2 avtozaz avtozaz 1 UP25 automotive loanbook avtozaz
#> 3 bogdan bogdan 1 UP36 automotive loanbook bogdan
#> 4 chauto chauto 1 UP52 automotive loanbook ch auto
#> 5 chehejia chehejia 1 UP53 automotive loanbook chehejia
#> 6 chtcauto chtcauto 1 UP58 automotive loanbook chtc au…
#> 7 dongfenghonda dongfenghonda 1 UP80 automotive loanbook dongfen…
#> 8 dongfengluxgen dongfengluxgen 1 UP79 automotive loanbook dongfen
#> 9 electricmobilit… electricmobilit… 1 UP89 automotive loanbook electri
#> 10 faradayfuture faradayfuture 1 UP94 automotive loanbook faraday
#> # … with 440 more rows, and 20 more variables: sector_ald <chr>,
#> # name_ultimate_parent_lbk <chr>, name_direct_loantaker_lbk <chr>,
#> # id_loan_lbk <chr>, id_direct_loantaker_lbk <chr>,
#> # id_intermediate_parent_1_lbk <chr>, name_intermediate_parent_1_lbk <chr>,
Expand All @@ -195,20 +195,20 @@ computation time, and the number of nonsensical matches.

``` r
match_name(your_loanbook, your_ald, by_sector = FALSE)
#> # A tibble: 1,101 x 27
#> # A tibble: 658 x 27
#> simpler_name_lbk simpler_name_ald score id_lbk sector_lbk source_lbk name_ald
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 abahydropowerge… abahydropowerge… 1 UP1 power loanbook aba hyd…
#> 2 achinskyglinozi… achinskyglinozi… 1 UP2 cement loanbook achinsk…
#> 3 affinityrenewab… affinityrenewab… 1 UP3 power loanbook affinit…
#> 4 africaoil corp africaoil corp 1 C2 oil and g… loanbook africa …
#> 5 africaoil corp africonshp sa 0.812 C2 oil and g… loanbook africon…
#> 6 africonshp sa africaoil corp 0.812 UP4 shipping loanbook africa
#> 7 africonshp sa africonshp sa 1 UP4 shipping loanbook africon
#> 8 agnisteelspriva… agnisteelspriva… 1 UP5 power loanbook agni st
#> 9 agrenewables agrenewables 1 UP6 power loanbook agrenew
#> 10 airasiaxbhd airasiaxbhd 1 C3 aviation loanbook airasia
#> # … with 1,091 more rows, and 20 more variables: sector_ald <chr>,
#> 1 abahydropowerge… abahydropowerge… 1 UP1 power loanbook aba hyd…
#> 2 achinskyglinozi… achinskyglinozi… 1 UP2 cement loanbook achinsk…
#> 3 affinityrenewab… affinityrenewab… 1 UP3 power loanbook affinit…
#> 4 africaoil corp africaoil corp 1 C2 oil and g… loanbook africa …
#> 5 africonshp sa africonshp sa 1 UP4 shipping loanbook africon…
#> 6 agnisteelspriva… agnisteelspriva… 1 UP5 power loanbook agni st
#> 7 agrenewables agrenewables 1 UP6 power loanbook agrenew
#> 8 airasiaxbhd airasiaxbhd 1 C3 aviation loanbook airasia
#> 9 airasiaxbhd airasiaxbhd 1 UP7 aviation loanbook airasia
#> 10 airbaltic airbaltic 1 C4 aviation loanbook airbalt
#> # … with 648 more rows, and 20 more variables: sector_ald <chr>,
#> # name_ultimate_parent_lbk <chr>, name_direct_loantaker_lbk <chr>,
#> # id_loan_lbk <chr>, id_direct_loantaker_lbk <chr>,
#> # id_intermediate_parent_1_lbk <chr>, name_intermediate_parent_1_lbk <chr>,
Expand All @@ -228,7 +228,7 @@ match_name(your_loanbook, your_ald, by_sector = FALSE)
matching_scores <- match_name(your_loanbook, your_ald, min_score = 0.9)

matching_scores
#> # A tibble: 438 x 27
#> # A tibble: 424 x 27
#> simpler_name_lbk simpler_name_ald score id_lbk sector_lbk source_lbk name_ald
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 astonmartin astonmartin 1 UP23 automotive loanbook aston m…
Expand All @@ -241,7 +241,7 @@ matching_scores
#> 8 dongfengluxgen dongfengluxgen 1 UP79 automotive loanbook dongfen…
#> 9 electricmobilit… electricmobilit… 1 UP89 automotive loanbook electri…
#> 10 faradayfuture faradayfuture 1 UP94 automotive loanbook faraday…
#> # … with 428 more rows, and 20 more variables: sector_ald <chr>,
#> # … with 414 more rows, and 20 more variables: sector_ald <chr>,
#> # name_ultimate_parent_lbk <chr>, name_direct_loantaker_lbk <chr>,
#> # id_loan_lbk <chr>, id_direct_loantaker_lbk <chr>,
#> # id_intermediate_parent_1_lbk <chr>, name_intermediate_parent_1_lbk <chr>,
Expand Down
39 changes: 38 additions & 1 deletion tests/testthat/test-match_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ test_that("match_name outputs a reasonable number of rows", {
prepare_loanbook_for_matching(loanbook_demo),
prepare_ald_for_matching(ald_demo)
) %>%
filter(score >= 0.8)
filter(score >= 0.8) %>%
prefer_perfect_match_by(.data$simpler_name_lbk)

nrows_out <- out %>%
select(names(expected)) %>%
Expand All @@ -117,3 +118,39 @@ test_that("match_name names end with _lbk or _ald, except `score`", {
"score"
)
})

test_that("match name outputs only perfect matches if any (#40 @2diiKlaus)", {
this_name <- "Nanaimo Forest Products Ltd."
this_simpler_name <- replace_customer_name(this_name)
this_lbk <- loanbook_demo %>%
filter(name_direct_loantaker == this_name)

nanimo_scores <- this_lbk %>%
match_name(ald_demo) %>%
filter(simpler_name_lbk == this_simpler_name) %>%
pull(score)

expect_true(
any(nanimo_scores == 1)
)
expect_true(
all(nanimo_scores == 1)
)
})

test_that("prefer_perfect_match_by prefers score == 1 if `var` group has any", {
# styler: off
data <- tribble(
~var, ~score,
1, 1,
2, 1,
2, 0.99,
3, 0.99,
)
# styler: on

expect_equal(
prefer_perfect_match_by(data, var),
tibble(var = c(1, 2, 3), score = c(1, 1, 0.99))
)
})

0 comments on commit 9eaac1e

Please sign in to comment.