Skip to content
This repository has been archived by the owner on Sep 30, 2022. It is now read-only.

Commit

Permalink
59 remove sufix lbk (RMI-PACTA#76)
Browse files Browse the repository at this point in the history
* Remove suffix _lbk from output of match_name()

* prioritize() no longer uses suffix _lbk

* Update documentation
  • Loading branch information
maurolepore committed Jan 6, 2020
1 parent d48b0c3 commit e7fd027
Show file tree
Hide file tree
Showing 10 changed files with 163 additions and 154 deletions.
13 changes: 9 additions & 4 deletions R/match_name.R
@@ -1,4 +1,4 @@
#' Match a loanbook (lbk) and asset-level datasets (ald) by the `name_*` columns
#' Match a loanbook and asset-level datasets (ald) by the `name_*` columns
#'
#' `match_name()` scores the match between names in a loanbook dataset (columns
#' `name_direct_loantaker` and `name_ultimate_parent`) with names in an
Expand All @@ -17,8 +17,8 @@
#' @family user-oriented
#'
#' @return A dataframe with the same columns as the loanbook data with
#' additional columns: `id_lkb`, `sector_lbk`, `sector_ald`, `source_lbk`,
#' `alias_lbk`, `alias_ald`, `score`, `name_ald`.
#' additional columns: `id`, `sector`, `sector_ald`, `source`,
#' `alias`, `alias_ald`, `score`, `name_ald`.
#'
#' @export
#'
Expand Down Expand Up @@ -70,7 +70,8 @@ match_name <- function(loanbook,
mutate(
level_lbk = sub("^name_", "", .data$level_lbk),
level_lbk = sub("_lbk$", "", .data$level_lbk),
)
) %>%
remove_suffix("_lbk")
}

suffix_names <- function(data, suffix, names = NULL) {
Expand All @@ -90,6 +91,10 @@ suffix_some_names <- function(data, suffix, names) {
rename(data, !!newnames_oldnames)
}

#' Remove a trailing suffix from the column names of a data frame
#'
#' @param data A data frame.
#' @param suffix A string giving the suffix to strip, e.g. `"_lbk"`. It is
#'   interpreted as a regular expression by `sub()`.
#'
#' @return `data` with `suffix` removed from the end of every name that ends
#'   with it; all other names are left untouched.
#'
#' @noRd
remove_suffix <- function(data, suffix) {
  # Anchor the pattern at the end of each name so that only a true suffix is
  # removed. Without the anchor, `sub()` drops the first occurrence anywhere in
  # the name (e.g. "x_lbk_y" would become "x_y"). A pattern that is already
  # anchored is used as is.
  pattern <- if (endsWith(suffix, "$")) suffix else paste0(suffix, "$")
  names(data) <- sub(pattern, "", names(data))
  data
}

pick_min_score <- function(data, min_score) {
data %>%
filter(.data$score >= min_score) %>%
Expand Down
16 changes: 8 additions & 8 deletions R/prioritize.R
Expand Up @@ -21,7 +21,7 @@
#' library(dplyr)
#'
#' matched <- tribble(
#' ~score, ~id_lbk, ~level_lbk,
#' ~score, ~id, ~level,
#' 1, "aa", "ultimate_parent",
#' 1, "aa", "direct_loantaker",
#' 1, "bb", "intermediate_parent",
Expand All @@ -41,15 +41,15 @@
#'
#' # Using a custom priority
#' bad_idea <- select_chr(
#' matched$level_lbk,
#' matched$level,
#' matches("intermediate"),
#' everything()
#' )
#' bad_idea
#'
#' prioritize(matched, priority = bad_idea)
prioritize <- function(data, priority = NULL) {
check_crucial_names(data, c("id_lbk", "level_lbk", "score"))
check_crucial_names(data, c("id", "level", "score"))
priority <- set_priority(data, priority = priority)

old_groups <- dplyr::groups(data)
Expand All @@ -60,8 +60,8 @@ prioritize <- function(data, priority = NULL) {
perfect_matches <- filter(ungroup(data), .data$score == 1L)

out <- perfect_matches %>%
group_by(.data$id_lbk) %>%
prioritize_at(.at = "level_lbk", priority = priority) %>%
group_by(.data$id) %>%
prioritize_at(.at = "level", priority = priority) %>%
ungroup()

group_by(out, !!!old_groups)
Expand All @@ -80,7 +80,7 @@ set_priority <- function(data, priority) {
priority <- f(prioritize_level(data))
}

known_levels <- sort(unique(data$level_lbk))
known_levels <- sort(unique(data$level))
unknown_levels <- setdiff(priority, known_levels)
if (!identical(unknown_levels, character(0))) {
warning(
Expand All @@ -106,7 +106,7 @@ set_priority <- function(data, priority) {
#'
#' @examples
#' matched <- tibble::tibble(
#' level_lbk = c(
#' level = c(
#' "intermediate_parent_1",
#' "direct_loantaker",
#' "direct_loantaker",
Expand All @@ -119,7 +119,7 @@ set_priority <- function(data, priority) {
prioritize_level <- function(data) {
select_chr(
# Sort suffixes: e.g. intermediate*1, *2, *n
sort(unique(data$level_lbk)),
sort(unique(data$level)),
tidyselect::matches("direct"),
tidyselect::matches("intermediate"),
tidyselect::matches("ultimate")
Expand Down
10 changes: 5 additions & 5 deletions R/score_alias_similarity.R
Expand Up @@ -47,11 +47,11 @@
#' left_join(ald, by = c("alias_ald" = "alias")) %>%
#' rename(sector_y = sector)
score_alias_similarity <- function(loanbook,
ald,
...,
by_sector = TRUE,
method = "jw",
p = 0.1) {
ald,
...,
by_sector = TRUE,
method = "jw",
p = 0.1) {
ellipsis::check_dots_used()

if (by_sector) {
Expand Down
8 changes: 4 additions & 4 deletions README.Rmd
Expand Up @@ -90,7 +90,7 @@ your_ald <- ald_demo

### 2. Score the goodness of the match between the loanbook and ald datasets

`match_name()` scores the match between names in a loanbook dataset (lbk) and names in an asset-level dataset (ald). The names come from the columns `name_direct_loantaker` and `name_ultimate_parent` of the loanbook dataset, and from the column `name_company` of the a asset-level dataset. The raw names are first transformed and stored in the columns `alias_lbk` and `alias_ald`. Then the similarity between `alias_lbk` and `alias_ald` is scored using `stringdist::stringsim()`. The process to create the `alias_*` columns applies best-practices commonly used in name matching algorithms, such as:
`match_name()` scores the match between names in a loanbook dataset (lbk) and names in an asset-level dataset (ald). The names come from the columns `name_direct_loantaker` and `name_ultimate_parent` of the loanbook dataset, and from the column `name_company` of the asset-level dataset. The raw names are first transformed and stored in the columns `alias` and `alias_ald`. Then the similarity between `alias` and `alias_ald` is scored using `stringdist::stringsim()`. The process to create the `alias_*` columns applies best-practices commonly used in name matching algorithms, such as:

* Remove special characters.
* Replace language specific characters.
Expand Down Expand Up @@ -133,7 +133,7 @@ matched %>%

* Open _matched.csv_ with any spreadsheet editor (e.g. MS Excel, Google Sheets).

* Visually compare `alias_lbk` and `alias_ald`, along with the loanbook sector.
* Visually compare `alias` and `alias_ald`, along with the loanbook sector.

* Edit the data manually:
* If you are happy with the match, set the `score` value to `1`.
Expand All @@ -155,7 +155,7 @@ matched <- read_csv("matched_edited.csv")
The `matched` dataset may have multiple matches per loan. To get the best match only, use `prioritize()` -- it picks rows where `score` is 1 and `level` per loan is of highest `priority()`.

```{r}
some_interesting_columns <- vars(id_lbk, level_lbk, starts_with("alias"), score)
some_interesting_columns <- vars(id, level, starts_with("alias"), score)
matched %>%
prioritize() %>%
Expand All @@ -180,7 +180,7 @@ You may also pass a character vector with a custom priority -- which you may wri

```{r}
bad_idea <- select_chr(
matched$level_lbk,
matched$level,
matches("intermediate"),
everything()
)
Expand Down
132 changes: 65 additions & 67 deletions README.md
Expand Up @@ -138,10 +138,10 @@ your_ald <- ald_demo
columns `name_direct_loantaker` and `name_ultimate_parent` of the
loanbook dataset, and from the column `name_company` of the
asset-level dataset. The raw names are first transformed and stored in
the columns `alias_lbk` and `alias_ald`. Then the similarity between
`alias_lbk` and `alias_ald` is scored using `stringdist::stringsim()`.
The process to create the `alias_*` columns applies best-practices
commonly used in name matching algorithms, such as:
the columns `alias` and `alias_ald`. Then the similarity between `alias`
and `alias_ald` is scored using `stringdist::stringsim()`. The process
to create the `alias_*` columns applies best-practices commonly used in
name matching algorithms, such as:

- Remove special characters.
- Replace language specific characters.
Expand All @@ -153,29 +153,27 @@ commonly used in name matching algorithms, such as:
``` r
match_name(your_loanbook, your_ald)
#> # A tibble: 1,350 x 26
#> alias_lbk alias_ald score id_lbk sector_lbk source_lbk name_ald sector_ald
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
#> 1 astonmar… astonmar… 1 UP23 automotive loanbook aston m… automotive
#> 2 astonmar… astonmar… 1 UP23 automotive loanbook aston m… automotive
#> 3 astonmar… astonmar… 1 UP23 automotive loanbook aston m… automotive
#> 4 avtozaz avtozaz 1 UP25 automotive loanbook avtozaz automotive
#> 5 avtozaz avtozaz 1 UP25 automotive loanbook avtozaz automotive
#> 6 avtozaz avtozaz 1 UP25 automotive loanbook avtozaz automotive
#> 7 bogdan bogdan 1 UP36 automotive loanbook bogdan automotive
#> 8 bogdan bogdan 1 UP36 automotive loanbook bogdan automotive
#> 9 bogdan bogdan 1 UP36 automotive loanbook bogdan automotive
#> 10 chauto chauto 1 UP52 automotive loanbook ch auto automotive
#> # … with 1,340 more rows, and 18 more variables: id_loan_lbk <chr>,
#> # id_direct_loantaker_lbk <chr>, id_intermediate_parent_1_lbk <chr>,
#> # id_ultimate_parent_lbk <chr>, loan_size_outstanding_lbk <dbl>,
#> # loan_size_outstanding_currency_lbk <chr>, loan_size_credit_limit_lbk <dbl>,
#> # loan_size_credit_limit_currency_lbk <chr>,
#> # sector_classification_system_lbk <chr>,
#> # sector_classification_input_type_lbk <chr>,
#> # sector_classification_direct_loantaker_lbk <dbl>, fi_type_lbk <chr>,
#> # flag_project_finance_loan_lbk <chr>, name_project_lbk <lgl>,
#> # lei_direct_loantaker_lbk <lgl>, isin_direct_loantaker_lbk <lgl>,
#> # level_lbk <chr>, name_lbk <chr>
#> alias alias_ald score id sector source name_ald sector_ald id_loan
#> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 asto… astonmar… 1 UP23 autom… loanb… aston m… automotive <NA>
#> 2 asto… astonmar… 1 UP23 autom… loanb… aston m… automotive <NA>
#> 3 asto… astonmar… 1 UP23 autom… loanb… aston m… automotive <NA>
#> 4 avto… avtozaz 1 UP25 autom… loanb… avtozaz automotive <NA>
#> 5 avto… avtozaz 1 UP25 autom… loanb… avtozaz automotive <NA>
#> 6 avto… avtozaz 1 UP25 autom… loanb… avtozaz automotive <NA>
#> 7 bogd… bogdan 1 UP36 autom… loanb… bogdan automotive <NA>
#> 8 bogd… bogdan 1 UP36 autom… loanb… bogdan automotive <NA>
#> 9 bogd… bogdan 1 UP36 autom… loanb… bogdan automotive <NA>
#> 10 chau… chauto 1 UP52 autom… loanb… ch auto automotive <NA>
#> # … with 1,340 more rows, and 17 more variables: id_direct_loantaker <chr>,
#> # id_intermediate_parent_1 <chr>, id_ultimate_parent <chr>,
#> # loan_size_outstanding <dbl>, loan_size_outstanding_currency <chr>,
#> # loan_size_credit_limit <dbl>, loan_size_credit_limit_currency <chr>,
#> # sector_classification_system <chr>, sector_classification_input_type <chr>,
#> # sector_classification_direct_loantaker <dbl>, fi_type <chr>,
#> # flag_project_finance_loan <chr>, name_project <lgl>,
#> # lei_direct_loantaker <lgl>, isin_direct_loantaker <lgl>, level <chr>,
#> # name <chr>
```

`match_name()` defaults to scoring matches between `alias_*` strings
Expand Down Expand Up @@ -217,8 +215,8 @@ matched %>%
- Open *matched.csv* with any spreadsheet editor (e.g. MS Excel,
Google Sheets).

- Visually compare `alias_lbk` and `alias_ald`, along with the
loanbook sector.
- Visually compare `alias` and `alias_ald`, along with the loanbook
sector.

- Edit the data manually:

Expand All @@ -244,24 +242,24 @@ best match only, use `priorityze()` – it picks rows where `score` is 1
and `level` per loan is of highest `priority()`.

``` r
some_interesting_columns <- vars(id_lbk, level_lbk, starts_with("alias"), score)
some_interesting_columns <- vars(id, level, starts_with("alias"), score)

matched %>%
prioritize() %>%
select(!!! some_interesting_columns)
#> # A tibble: 402 x 5
#> id_lbk level_lbk alias_lbk alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 direct_loantaker astonmartin astonmartin 1
#> 2 UP25 direct_loantaker avtozaz avtozaz 1
#> 3 UP36 direct_loantaker bogdan bogdan 1
#> 4 UP52 direct_loantaker chauto chauto 1
#> 5 UP53 direct_loantaker chehejia chehejia 1
#> 6 UP58 direct_loantaker chtcauto chtcauto 1
#> 7 UP80 direct_loantaker dongfenghonda dongfenghonda 1
#> 8 UP79 direct_loantaker dongfengluxgen dongfengluxgen 1
#> 9 UP89 direct_loantaker electricmobilitysoluti… electricmobilitysoluti… 1
#> 10 UP94 direct_loantaker faradayfuture faradayfuture 1
#> id level alias alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 direct_loantaker astonmartin astonmartin 1
#> 2 UP25 direct_loantaker avtozaz avtozaz 1
#> 3 UP36 direct_loantaker bogdan bogdan 1
#> 4 UP52 direct_loantaker chauto chauto 1
#> 5 UP53 direct_loantaker chehejia chehejia 1
#> 6 UP58 direct_loantaker chtcauto chtcauto 1
#> 7 UP80 direct_loantaker dongfenghonda dongfenghonda 1
#> 8 UP79 direct_loantaker dongfengluxgen dongfengluxgen 1
#> 9 UP89 direct_loantaker electricmobilitysolutio… electricmobilitysoluti… 1
#> 10 UP94 direct_loantaker faradayfuture faradayfuture 1
#> # … with 392 more rows
```

Expand All @@ -281,18 +279,18 @@ matched %>%
prioritize(priority = rev) %>%
select(!!! some_interesting_columns)
#> # A tibble: 402 x 5
#> id_lbk level_lbk alias_lbk alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 ultimate_parent astonmartin astonmartin 1
#> 2 UP25 ultimate_parent avtozaz avtozaz 1
#> 3 UP36 ultimate_parent bogdan bogdan 1
#> 4 UP52 ultimate_parent chauto chauto 1
#> 5 UP53 ultimate_parent chehejia chehejia 1
#> 6 UP58 ultimate_parent chtcauto chtcauto 1
#> 7 UP80 ultimate_parent dongfenghonda dongfenghonda 1
#> 8 UP79 ultimate_parent dongfengluxgen dongfengluxgen 1
#> 9 UP89 ultimate_parent electricmobilitysolutio… electricmobilitysoluti… 1
#> 10 UP94 ultimate_parent faradayfuture faradayfuture 1
#> id level alias alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 ultimate_parent astonmartin astonmartin 1
#> 2 UP25 ultimate_parent avtozaz avtozaz 1
#> 3 UP36 ultimate_parent bogdan bogdan 1
#> 4 UP52 ultimate_parent chauto chauto 1
#> 5 UP53 ultimate_parent chehejia chehejia 1
#> 6 UP58 ultimate_parent chtcauto chtcauto 1
#> 7 UP80 ultimate_parent dongfenghonda dongfenghonda 1
#> 8 UP79 ultimate_parent dongfengluxgen dongfengluxgen 1
#> 9 UP89 ultimate_parent electricmobilitysolutio… electricmobilitysolutio… 1
#> 10 UP94 ultimate_parent faradayfuture faradayfuture 1
#> # … with 392 more rows
```

Expand All @@ -301,7 +299,7 @@ may write explicitly or with the help of `select_chr()`.

``` r
bad_idea <- select_chr(
matched$level_lbk,
matched$level,
matches("intermediate"),
everything()
)
Expand All @@ -313,17 +311,17 @@ matched %>%
prioritize(priority = bad_idea) %>%
select(!!! some_interesting_columns)
#> # A tibble: 402 x 5
#> id_lbk level_lbk alias_lbk alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 intermediate_paren… astonmartin astonmartin 1
#> 2 UP25 intermediate_paren… avtozaz avtozaz 1
#> 3 UP36 intermediate_paren… bogdan bogdan 1
#> 4 UP52 intermediate_paren… chauto chauto 1
#> 5 UP53 intermediate_paren… chehejia chehejia 1
#> 6 UP58 intermediate_paren… chtcauto chtcauto 1
#> 7 UP80 intermediate_paren… dongfenghonda dongfenghonda 1
#> 8 UP79 intermediate_paren… dongfengluxgen dongfengluxgen 1
#> 9 UP89 intermediate_paren… electricmobilitysolut… electricmobilitysolu… 1
#> 10 UP94 intermediate_paren… faradayfuture faradayfuture 1
#> id level alias alias_ald score
#> <chr> <chr> <chr> <chr> <dbl>
#> 1 UP23 intermediate_paren… astonmartin astonmartin 1
#> 2 UP25 intermediate_paren… avtozaz avtozaz 1
#> 3 UP36 intermediate_paren… bogdan bogdan 1
#> 4 UP52 intermediate_paren… chauto chauto 1
#> 5 UP53 intermediate_paren… chehejia chehejia 1
#> 6 UP58 intermediate_paren… chtcauto chtcauto 1
#> 7 UP80 intermediate_paren… dongfenghonda dongfenghonda 1
#> 8 UP79 intermediate_paren… dongfengluxgen dongfengluxgen 1
#> 9 UP89 intermediate_paren… electricmobilitysolut… electricmobilitysolut… 1
#> 10 UP94 intermediate_paren… faradayfuture faradayfuture 1
#> # … with 392 more rows
```
6 changes: 3 additions & 3 deletions man/match_name.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/prioritize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/prioritize_level.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e7fd027

Please sign in to comment.