From 227b34f5ff3bd528e222ae37e2c1211927cc46ab Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Mon, 31 Jan 2022 21:11:58 +1100 Subject: [PATCH 01/16] minor before serious update --- R/data.table.r | 37 +------------------------------------ R/recommend_nchunks.r | 20 ++++++++++++++++---- R/util.r | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 40 deletions(-) diff --git a/R/data.table.r b/R/data.table.r index 0d12184..9d64812 100644 --- a/R/data.table.r +++ b/R/data.table.r @@ -24,42 +24,7 @@ code = substitute(chunk[...]) - # sometimes the arguments could be empty - # in a recent version of globals that would cause a fail - # to avoid the fail remove them from the test - #dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))] - - #ag = globals::findGlobals(dotdotdot_for_find_global) - #ag = setdiff(ag, "") # "" can cause issues with future # this line no longer needed - - - # you need to use list otherwise the names will be gone - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { - globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) - } else { - globals_and_pkgs = future::getGlobalsAndPackages(code) - } - - - global_vars = globals_and_pkgs$globals - - env = parent.frame() - - done = identical(env, emptyenv()) || identical(env, globalenv()) - - # keep adding global variables by moving up the environment chain - while(!done) { - tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) - new_global_vars = tmp_globals_and_pkgs$globals - for (name in setdiff(names(new_global_vars), names(global_vars))) { - global_vars[[name]] <- new_global_vars[[name]] - } - - done = identical(env, emptyenv()) || identical(env, globalenv()) - env = parent.env(env) - } - - globals_and_pkgs$globals = global_vars + globals_and_pkgs = find_globals_recursively(code, parent.frame()) res = future.apply::future_lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { #res = lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 2b98760..6b98a2d 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -1,3 +1,4 @@ + #' Recommend number of chunks based on input size #' @description Computes the recommended number of chunks to break a data.frame #' into. It can accept filesizes in bytes (as integer) or a data.frame @@ -119,10 +120,21 @@ df_ram_size <- function() { ram_size = benchmarkme::get_ram()/1024^3 if(is.na(ram_size)) { - warning("RAM size can't be determined. Assume you have 16GB of RAM.") - warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") - warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) - return(16) + # try another method + os = R.version$os + ram = suppressWarnings(try(system_ram(os), silent=TRUE)) + + if (class(ram) == "try-error" || length(ram) == 0 || + is.na(ram)) { + warning("RAM size can't be determined. 
Assume you have 16GB of RAM.") + warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") + warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) + return(16) + } else { + sum(as.numeric(ram), na.rm=TRUE) + + } + } else { ram_size = max(ram_size, 1, na.rm = TRUE) return(ram_size) diff --git a/R/util.r b/R/util.r index 8b760f0..bc91de3 100644 --- a/R/util.r +++ b/R/util.r @@ -45,3 +45,30 @@ purrr_as_mapper <- function(.f) { } return(.f) } + +#' Find globals in an expression by searching through the chain +find_globals_recursively <- function(code, envir) { + globals_and_pkgs = future::getGlobalsAndPackages(code, envir) + + global_vars = globals_and_pkgs$globals + + env = parent.env(envir) + + done = identical(env, emptyenv()) || identical(env, globalenv()) + + # keep adding global variables by moving up the environment chain + while(!done) { + tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) + new_global_vars = tmp_globals_and_pkgs$globals + for (name in setdiff(names(new_global_vars), names(global_vars))) { + global_vars[[name]] <- new_global_vars[[name]] + } + + done = identical(env, emptyenv()) || identical(env, globalenv()) + env = parent.env(env) + } + + globals_and_pkgs$globals = global_vars + + return(globals_and_pkgs) +} \ No newline at end of file From 9794c15f02f7d3f6ab7847335a5c815219b966ee Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Mon, 31 Jan 2022 22:04:07 +1100 Subject: [PATCH 02/16] perfected the globals var capture for group-by and summarize as well --- R/collect.summarized_disk.frame.r | 30 +++++++++++++-- R/one-stage-verbs.R | 64 ++++++++++++++++++++++++------- tests/testthat/test-group-by.R | 45 ++++++++++++++++++++++ 3 files changed, 123 insertions(+), 16 deletions(-) diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index edd686a..a641e90 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -74,8 +74,18 @@ collect.summarized_disk.frame <- } } + group_by_globals_list = attr(x_as.disk.frame, "group_by_globals_and_pkgs")$globals + + if(is.null(group_by_globals_list)) { + eval_clos = parent.frame() + } else { + eval_clos = list2env(group_by_globals_list, parent=parent.frame()) + } + # TODO add appropriate environment - tmp_df = eval(first_stage_code) + # tmp_df = eval(first_stage_code, envir=environment(), enclos = eval_clos + tmp_df = eval(first_stage_code, group_by_globals_list) + n_summ_args = length(dotdotdot_chunk_agg) @@ -103,7 +113,16 @@ collect.summarized_disk.frame <- ) } - tmp2 = collect(eval(parse(text = chunk_summ_code_str))) + summarize_globals_list = attr(x_as.disk.frame, "summarize_globals_and_pkgs")$globals + + if(is.null(summarize_globals_list)) { + summ_eval_clos = parent.frame() + } else { + summ_eval_clos = list2env(summarize_globals_list, parent=parent.frame()) + } + + #tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = environment(), enclos=summ_eval_clos)) + tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = summarize_globals_list)) second_stage_code = eval(parse(text = sprintf( "quote(group_by(tmp2, %s))", paste0(rep_len("NULL", n_grp_args), collapse = ", ") @@ -111,7 +130,12 @@ collect.summarized_disk.frame <- if (n_grp_args >= 1) { for (i in 1:n_grp_args) { - second_stage_code[[i + 2]] = group_by_vars[[i]] + second_stage_code[[i + 2]] = group_by_vars[[i]] %>% + deparse() %>% + paste0(collapse="") %>% + sprintf("`%s`", .) %>% + parse(text=.) 
%>% + .[[1]] } } diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R index 27f62f2..f96d226 100644 --- a/R/one-stage-verbs.R +++ b/R/one-stage-verbs.R @@ -209,19 +209,34 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) { #' @rdname group_by #' @export summarise.grouped_disk.frame <- function(.data, ...) { + + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) # convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- rlang::as_label(dotdotdot[[i]]) %>% + parse(text = .) %>% + .[[1]] } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } @@ -246,7 +261,6 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame #' @rdname group_by # learning from https://docs.dask.org/en/latest/dataframe-groupby.html group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) { - class(.data) <- c("grouped_disk.frame", "disk.frame") # using rlang is a neccesary evil here as I need to deal with !!! that is supported by group_by etc @@ -254,14 +268,23 @@ group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.fra # convert any quosure to labels for (i in seq_along(group_by_cols)) { - if("quosure" %in% class(group_by_cols[[i]])) { - group_by_cols[[i]] <- rlang::sym(rlang::as_label(group_by_cols[[i]])) - } + group_by_cols[[i]] <- group_by_cols[[i]] %>% + rlang::as_label() %>% + parse(text=.) %>% + .[[1]] } - attr(.data, "group_by_cols") = group_by_cols + # detect any global variables + args_str = sapply(group_by_cols, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "group_by_globals_and_pkgs") = find_globals_recursively(parse(text=sprintf("list(%s)", args_str))[[1]], parent.frame()) + .data } @@ -270,19 +293,34 @@ group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.fra #' @importFrom dplyr summarize #' @rdname group_by summarize.disk.frame <- function(.data, ...) { + + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) # convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- rlang::as_label(dotdotdot[[i]]) %>% + parse(text=.) 
%>% + .[[1]] } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } diff --git a/tests/testthat/test-group-by.R b/tests/testthat/test-group-by.R index e61f75d..e28b610 100644 --- a/tests/testthat/test-group-by.R +++ b/tests/testthat/test-group-by.R @@ -299,6 +299,51 @@ test_that("tests for {{}}", { expect_equal(a, b) }) + +test_that("tests for global", { + # TODO make this work + val = 2 + val2 = 2 + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + expect_equal(b, a) + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(a, b) + + a = iris %>% + summarize(mean(Petal.Length+val2)) + + b = iris %>% + as.disk.frame %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(names(a), names(b)) + + expect_equal(a[[1]], b[[1]]) +}) + teardown({ # fs::file_delete(file.path(tempdir(), "tmp_pls_delete_gb.csv")) # fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_gb.df")) From 7c302f31328440652775bc42662ee142676c8c0e Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Mon, 31 Jan 2022 22:16:27 +1100 Subject: [PATCH 03/16] minor addition to data.table test --- tests/testthat/test-data-table.r | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-data-table.r b/tests/testthat/test-data-table.r index d162945..fb58f32 100644 --- a/tests/testthat/test-data-table.r +++ b/tests/testthat/test-data-table.r @@ -45,10 +45,12 @@ test_that("data.table global vars", { } # Check function with data.table object - grep_storm_name(storms_dt, "^A") + a = grep_storm_name(storms_dt, "^A") # Check function with diskframe object - grep_storm_name(storms_df, "^A") + b = grep_storm_name(storms_df, "^A") + + expect_equal(a, b) }) teardown({ From 1d0a52b682d2926793a153d4e6aae657204743fb Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Mon, 31 Jan 2022 22:38:48 +1100 Subject: [PATCH 04/16] add dplyr test --- tests/testthat/test-dplyr-verbs.r | 130 ++++++++++++++++++------------ 1 file changed, 77 insertions(+), 53 deletions(-) diff --git a/tests/testthat/test-dplyr-verbs.r b/tests/testthat/test-dplyr-verbs.r index 0131455..95ccd48 100644 --- a/tests/testthat/test-dplyr-verbs.r +++ b/tests/testthat/test-dplyr-verbs.r @@ -7,31 +7,31 @@ setup({ test_that("testing select", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - select(a) %>% + + df = b %>% + select(a) %>% collect - + expect_equal(ncol(df), 1) }) test_that("testing rename", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - rename(a_new_name = a) %>% + + df = b %>% + rename(a_new_name = a) %>% collect - + expect_setequal(colnames(df), c("a_new_name", "b")) }) test_that("testing filter", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - 
- df = b %>% - filter(a <= 100, b <= 10) %>% + + df = b %>% + filter(a <= 100, b <= 10) %>% collect - + expect_setequal(nrow(df), 10) }) @@ -44,109 +44,118 @@ test_that("testing filter - global vars", { filter(a <= one_hundred, b <= 10) %>% collect + expect_setequal(nrow(df), 10) + + + # testing_env = new.env() + # + # assign("one_hundred", 100, testing_env) + # assign("b", disk.frame(file.path(tempdir(), "tmp_b_dv.df")), testing_env) + # + # eval(parse(text=code), envir = testing_env, enclos=emptyenv()) }) test_that("testing mutate", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - - df = b %>% - mutate(d = a + b) %>% + + + df = b %>% + mutate(d = a + b) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) - - df = b %>% + + df = b %>% mutate(e = rank(desc(a))) %>% collect - + expect_equal(nrow(df), 100) - + # need to test value <- as.disk.frame(tibble(char = LETTERS, num = 1:26)) df2 = value %>% dplyr::mutate(b = case_when( char %in% c("A", "B", "C") ~ "1", - TRUE ~ char)) %>% + TRUE ~ char)) %>% collect - + expect_equal(ncol(df2), 3) - + # testing fn = function(a, b) { a+b } - + df3 = value %>% dplyr::mutate(b = fn(num, num)) %>% collect - + expect_equal(ncol(df3), 3) - - + + global_var = 100 - + df4 = value %>% dplyr::mutate(b = fn(num, num), d = global_var*2) %>% collect - + expect_equal(ncol(df4), 4) expect_true(all(df4$d == 200)) }) test_that("testing mutate user-defined function", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - + + udf = function(a1, b1) { a1 + b1 } - + df = b %>% mutate(d = udf(a,b)) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) }) test_that("testing transmute", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - transmute(d = a + b) %>% + + df = b %>% + transmute(d = a + b) %>% collect - + expect_setequal(names(df), c("d")) }) test_that("testing arrange", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + expect_warning(df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% arrange(desc(random_unif))) - + df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% chunk_arrange(desc(random_unif)) - + x = purrr::map_lgl(1:nchunks(df), ~{ is.unsorted(.x) == FALSE }) - + expect_true(all(x)) }) test_that("testing chunk_summarise", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + df = b %>% - chunk_summarise(suma = sum(a)) %>% - collect %>% + chunk_summarise(suma = sum(a)) %>% + collect %>% summarise(suma = sum(suma)) - + expect_equal(df$suma, collect(b)$a %>% sum) }) @@ -154,21 +163,36 @@ test_that("testing mutate within function works", { test_f <- function(params, x_df){ x_df %>% mutate(aha = params[1]*cyl + params[2]*disp) } - + expect_true("aha" %in% names(test_f(c(1, 2), mtcars))) }) test_that("filter failure: prevent github #191 regression", { flights_df = as.disk.frame(nycflights13::flights) - + # expect error due to syntax error - expect_error(flights_df %>% - filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% + expect_error(flights_df %>% + filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% collect) - + delete(flights_df) }) +test_that("testing {{}}", { + ok <- function(input_data, col) { + val = 2 + input_data %>% + mutate({{col}} + val, val2 = speed+dist) %>% + collect + } + + a = ok(as.disk.frame(cars), dist) + + b = ok(cars, dist) + + expect_equal(a, b) +}) + teardown({ fs::dir_delete(file.path(tempdir(), "tmp_b_dv.df")) From 
e30336538e63345ce41b65fb15e6edc57aa902c6 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Mon, 31 Jan 2022 23:21:33 +1100 Subject: [PATCH 05/16] fixed bugs --- R/chunk_mapper.r | 25 +++++++++++++++++++++++-- R/one-stage-verbs.R | 14 ++++---------- R/recommend_nchunks.r | 1 - 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index 1b0d3ff..936667d 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -42,10 +42,31 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FA warning(warning_msg) } + dotdotdot = rlang::enexprs(...) + + # convert any quosure to labels + for (i in seq_along(dotdotdot)) { + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() + } + + args_str = mapply(function(name, val) { + rhs = deparse(val) %>% + paste0(collapse = "") + if(name != "") { + sprintf("%s=%s", name, rhs) + } else { + rhs + } + }, names(dotdotdot), dotdotdot) %>% + paste0(collapse = ", ") + + code = parse(text=sprintf("chunk_fn(.disk.frame.chunk, %s)", args_str))[[1]] + # you need to use list otherwise the names will be gone - code = substitute(chunk_fn(.disk.frame.chunk, ...)) + # code = substitute(chunk_fn(.disk.frame.chunk, ...)) - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { + if (paste0(deparse(code), collapse="") == "chunk_fn(.disk.frame.chunk, NULL)") { globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) } else { globals_and_pkgs = future::getGlobalsAndPackages(code) diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R index f96d226..b9fabba 100644 --- a/R/one-stage-verbs.R +++ b/R/one-stage-verbs.R @@ -209,7 +209,6 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) { #' @rdname group_by #' @export summarise.grouped_disk.frame <- function(.data, ...) { - class(.data) <- c("summarized_disk.frame", "disk.frame") # get all components of the summarise @@ -217,9 +216,7 @@ summarise.grouped_disk.frame <- function(.data, ...) { # convert any quosure to labels for (i in seq_along(dotdotdot)) { - dotdotdot[[i]] <- rlang::as_label(dotdotdot[[i]]) %>% - parse(text = .) %>% - .[[1]] + dotdotdot[[i]] <- rlang::quo_squash(dotdotdot[[i]]) } attr(.data, "summarize_code") = dotdotdot @@ -269,9 +266,7 @@ group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.fra # convert any quosure to labels for (i in seq_along(group_by_cols)) { group_by_cols[[i]] <- group_by_cols[[i]] %>% - rlang::as_label() %>% - parse(text=.) %>% - .[[1]] + rlang::quo_squash() } attr(.data, "group_by_cols") = group_by_cols @@ -301,9 +296,8 @@ summarize.disk.frame <- function(.data, ...) { # convert any quosure to labels for (i in seq_along(dotdotdot)) { - dotdotdot[[i]] <- rlang::as_label(dotdotdot[[i]]) %>% - parse(text=.) %>% - .[[1]] + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() } attr(.data, "summarize_code") = dotdotdot diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 6b98a2d..7d45fb8 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -56,7 +56,6 @@ recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthr #' # returns the RAM size in gigabyte (GB) #' df_ram_size() df_ram_size <- function() { - #browser() tryCatch({ ram_size = NULL # the amount of memory available in gigabytes From d5a7b510441a7f8616d94135a9d4a57cd779e1c4 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Tue, 1 Feb 2022 00:10:27 +1100 Subject: [PATCH 06/16] auto srckeep! 
--- R/collect.summarized_disk.frame.r | 19 +++++++++++++++++++ man/find_globals_recursively.Rd | 11 +++++++++++ 2 files changed, 30 insertions(+) create mode 100644 man/find_globals_recursively.Rd diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index a641e90..cad8ad2 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -26,7 +26,26 @@ collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x, "recordings"))) { dotdotdot <- attr(x, 'summarize_code') + group_by_vars = attr(x, "group_by_cols") + + browser() + + # look at the group by and summarise codes and figure out which columns need to be + # srckeep + df_to_find_cols = fst::read_fst(get_chunk_ids(x, full.names = TRUE)[1], from=1, to=1) + + cols_in_summ = lapply(dotdotdot, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + cols_in_group_by = lapply(group_by_vars, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + src_keep_cols = intersect(names(df_to_find_cols), c(cols_in_summ, cols_in_group_by) %>% unique) + x = srckeep(x, src_keep_cols) + # make a copy dotdotdot_chunk_agg <- dotdotdot dotdotdot_collected_agg <- dotdotdot diff --git a/man/find_globals_recursively.Rd b/man/find_globals_recursively.Rd new file mode 100644 index 0000000..bfca17b --- /dev/null +++ b/man/find_globals_recursively.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util.r +\name{find_globals_recursively} +\alias{find_globals_recursively} +\title{Find globals in an expression by searching through the chain} +\usage{ +find_globals_recursively(code, envir) +} +\description{ +Find globals in an expression by searching through the chain +} From 36cd12988b8f7ea848c0e6d79122bbb02eb783b7 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Tue, 1 Feb 2022 00:17:47 +1100 Subject: [PATCH 07/16] update for release --- R/collect.summarized_disk.frame.r | 1 + R/recommend_nchunks.r | 11 ----------- R/util.r | 2 ++ 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index cad8ad2..5f10de5 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -13,6 +13,7 @@ #' @importFrom data.table data.table as.data.table #' @importFrom purrr map_dfr #' @importFrom dplyr collect select mutate +#' @importFrom globals findGlobals #' @return collect return a data.frame/data.table #' @examples #' cars.df = as.disk.frame(cars) diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 7d45fb8..b8cd560 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -119,21 +119,10 @@ df_ram_size <- function() { ram_size = benchmarkme::get_ram()/1024^3 if(is.na(ram_size)) { - # try another method - os = R.version$os - ram = suppressWarnings(try(system_ram(os), silent=TRUE)) - - if (class(ram) == "try-error" || length(ram) == 0 || - is.na(ram)) { warning("RAM size can't be determined. 
Assume you have 16GB of RAM.") warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) return(16) - } else { - sum(as.numeric(ram), na.rm=TRUE) - - } - } else { ram_size = max(ram_size, 1, na.rm = TRUE) return(ram_size) diff --git a/R/util.r b/R/util.r index bc91de3..1bdb097 100644 --- a/R/util.r +++ b/R/util.r @@ -47,6 +47,8 @@ purrr_as_mapper <- function(.f) { } #' Find globals in an expression by searching through the chain +#' @param code An expression to search for globals +#' @param envir The environment from which to begin the search find_globals_recursively <- function(code, envir) { globals_and_pkgs = future::getGlobalsAndPackages(code, envir) From 6852d353a7d53c633edb0e588b7e035459376518 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Tue, 1 Feb 2022 23:00:30 +1100 Subject: [PATCH 08/16] bug fixed --- DESCRIPTION | 5 +- NAMESPACE | 1 + NEWS.md | 5 + R/chunk_mapper.r | 8 +- R/collect.r | 7 +- R/collect.summarized_disk.frame.r | 2 - R/get_chunk.r | 6 +- R/util.r | 4 + docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/01-intro.html | 2 +- docs/articles/02-intro-disk-frame.html | 38 +++--- docs/articles/03-concepts.html | 2 +- docs/articles/04-ingesting-data.html | 2 +- docs/articles/05-data-table-syntax.html | 2 +- docs/articles/06-vs-dask-juliadb.html | 10 +- docs/articles/07-glm.html | 2 +- docs/articles/08-more-epic.html | 2 +- docs/articles/09-convenience-features.html | 2 +- docs/articles/10-group-by.html | 2 +- docs/articles/11-custom-group-by.html | 2 +- docs/articles/88-trouble-shooting.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 6 +- docs/index.html | 34 ++--- docs/news/index.html | 8 +- docs/pkgdown.yml | 2 +- docs/reference/add_chunk.html | 10 +- docs/reference/as.data.frame.disk.frame.html | 2 +- docs/reference/as.data.table.disk.frame.html | 2 +- docs/reference/as.disk.frame.html | 2 +- docs/reference/bind_rows.disk.frame.html | 2 +- docs/reference/chunk_group_by.html | 2 +- docs/reference/cmap.html | 4 +- docs/reference/cmap2.html | 2 +- docs/reference/collect.html | 2 +- docs/reference/colnames.html | 2 +- docs/reference/compute.disk.frame.html | 2 +- docs/reference/create_chunk_mapper.html | 2 +- docs/reference/csv_to_disk.frame.html | 2 +- docs/reference/delete.html | 2 +- docs/reference/df_ram_size.html | 2 +- docs/reference/dfglm.html | 2 +- docs/reference/disk.frame.html | 4 +- docs/reference/dplyr_verbs.html | 2 +- docs/reference/evalparseglue.html | 2 +- docs/reference/find_globals_recursively.html | 133 +++++++++++++++++++ docs/reference/foverlaps.disk.frame.html | 2 +- docs/reference/gen_datatable_synthetic.html | 2 +- docs/reference/get_chunk.html | 2 +- docs/reference/get_chunk_ids.html | 14 +- docs/reference/group_by.html | 2 +- docs/reference/groups.disk.frame.html | 2 +- docs/reference/head_tail.html | 2 +- docs/reference/index.html | 6 +- docs/reference/is_disk.frame.html | 2 +- docs/reference/join.html | 2 +- docs/reference/make_glm_streaming_fn.html | 2 +- docs/reference/merge.disk.frame.html | 2 +- docs/reference/move_to.html | 2 +- docs/reference/nchunks.html | 2 +- docs/reference/ncol_nrow.html | 2 +- docs/reference/one-stage-group-by-verbs.html | 2 +- docs/reference/overwrite_check.html | 2 +- docs/reference/play.html | 2 +- docs/reference/print.disk.frame.html | 2 +- docs/reference/pull.disk.frame.html | 2 +- docs/reference/purrr_as_mapper.html | 2 +- 
docs/reference/rbindlist.disk.frame.html | 2 +- docs/reference/rechunk.html | 6 +- docs/reference/recommend_nchunks.html | 2 +- docs/reference/remove_chunk.html | 8 +- docs/reference/sample.html | 34 ++--- docs/reference/setup_disk.frame.html | 2 +- docs/reference/shard.html | 2 +- docs/reference/shardkey.html | 2 +- docs/reference/shardkey_equal.html | 2 +- docs/reference/show_ceremony.html | 2 +- docs/reference/srckeep.html | 2 +- docs/reference/sub-.disk.frame.html | 2 +- docs/reference/tbl_vars.disk.frame.html | 2 +- docs/reference/write_disk.frame.html | 2 +- docs/reference/zip_to_disk.frame.html | 4 +- docs/sitemap.xml | 3 + man/find_globals_recursively.Rd | 5 + utils/build_utils.R | 6 + 86 files changed, 333 insertions(+), 158 deletions(-) create mode 100644 docs/reference/find_globals_recursively.html diff --git a/DESCRIPTION b/DESCRIPTION index 1301c32..32cc40f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.6.0 -Date: 2022-01-31 +Version: 0.6.1 +Date: 2022-02-01 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) @@ -30,6 +30,7 @@ Imports: bit64, benchmarkme, purrr (>= 0.3.2), + globals, rlang Depends: R (>= 3.4), diff --git a/NAMESPACE b/NAMESPACE index 2def491..08ceb7a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -195,6 +195,7 @@ importFrom(future,nbrOfWorkers) importFrom(future,plan) importFrom(future,sequential) importFrom(future.apply,future_lapply) +importFrom(globals,findGlobals) importFrom(glue,glue) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) diff --git a/NEWS.md b/NEWS.md index e075c6d..c5e6aeb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# disk.frame 0.6.1 +* Fixed bug with data.table syntax +* Auto detection of srckeep in group by +* Global detection for group by and summarise + # disk.frame 0.6 * Much better NSE support in disk.frame! * removed `hard_arrange` and `hard_group_by` diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index 936667d..dc6b055 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -41,7 +41,6 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FA if(!is.null(warning_msg)) { warning(warning_msg) } - dotdotdot = rlang::enexprs(...) 
# convert any quosure to labels @@ -61,7 +60,12 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FA }, names(dotdotdot), dotdotdot) %>% paste0(collapse = ", ") - code = parse(text=sprintf("chunk_fn(.disk.frame.chunk, %s)", args_str))[[1]] + if (args_str == "") { + code = parse(text="chunk_fn(.disk.frame.chunk)")[[1]] + } else { + code = parse(text=sprintf("chunk_fn(.disk.frame.chunk, %s)", args_str))[[1]] + } + # you need to use list otherwise the names will be gone # code = substitute(chunk_fn(.disk.frame.chunk, ...)) diff --git a/R/collect.r b/R/collect.r index cf5add1..5181e9e 100644 --- a/R/collect.r +++ b/R/collect.r @@ -27,12 +27,13 @@ collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings")) cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE) if(nchunks(x) > 0) { if(parallel) { - tmp = future.apply::future_lapply(cids, function(.x) { - get_chunk(x, .x, full.names = TRUE) + tmp<-future.apply::future_lapply(cids, function(.x) { + + get_chunk.disk.frame(x, .x, full.names = TRUE) }, future.seed = TRUE) return(rbindlist(tmp)) } else { - purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE)) + purrr::map_dfr(cids, ~get_chunk.disk.frame(x, .x, full.names = TRUE)) } } else { data.table() diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index 5f10de5..104ac60 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -29,8 +29,6 @@ collect.summarized_disk.frame <- dotdotdot <- attr(x, 'summarize_code') group_by_vars = attr(x, "group_by_cols") - browser() - # look at the group by and summarise codes and figure out which columns need to be # srckeep df_to_find_cols = fst::read_fst(get_chunk_ids(x, full.names = TRUE)[1], from=1, to=1) diff --git a/R/get_chunk.r b/R/get_chunk.r index 01c06e6..9ed53f9 100644 --- a/R/get_chunk.r +++ b/R/get_chunk.r @@ -86,11 +86,13 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) } } else { - if(typeof(keep)!="closure") { + if(typeof(keep)=="closure") { play(fst::read_fst(filename, as.data.table = TRUE,...), recordings) } else { - play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), recordings) + tmp_df_input = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) 
+ res = play(tmp_df_input, recordings) + return(res) } } } diff --git a/R/util.r b/R/util.r index 1bdb097..03987c4 100644 --- a/R/util.r +++ b/R/util.r @@ -54,6 +54,10 @@ find_globals_recursively <- function(code, envir) { global_vars = globals_and_pkgs$globals + done = identical(envir, emptyenv()) || identical(envir, globalenv()) + + if(done) return(globals_and_pkgs) + env = parent.env(envir) done = identical(env, emptyenv()) || identical(env, globalenv()) diff --git a/docs/404.html b/docs/404.html index 9ccac74..7a1a321 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 5757cc4..9085d47 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index af35342..fa24bbe 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index 5526ed1..aadbfeb 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -555,26 +555,26 @@

Sampling
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1  10     1614           1605         9     1926           1944
-#> 2: 2013     5  13     1136           1136         0     1225           1234
-#> 3: 2013     5  16     1428           1420         8     1535           1538
-#> 4: 2013    12  29     1536           1520        16     1817           1750
-#> 5: 2013    12  21      849            820        29     1315           1345
-#> 6: 2013    12  25     1624           1559        25     1846           1825
+#> 1: 2013     5  16      930            906        24     1235           1239
+#> 2: 2013    12  30      818            810         8     1008            955
+#> 3: 2013     2   6     1238           1240        -2     1536           1555
+#> 4: 2013    12  20      921            854        27     1226           1212
+#> 5: 2013     1  15     1452           1500        -8     1625           1619
+#> 6: 2013     2   3     1340           1320        20     1511           1509
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:       -18      DL   1508  N952DL    JFK  RSW      163     1074   16      5
-#> 2:        -9      EV   3830  N13955    EWR  PVD       35      160   11     36
-#> 3:        -3      EV   4284  N11536    EWR  ROC       45      246   14     20
-#> 4:        27      MQ   3553  N520MQ    LGA  XNA      181     1147   15     20
-#> 5:       -30      DL    454  N682DA    JFK  STT      188     1623    8     20
-#> 6:        21      EV   5567  N870AS    LGA  CAE       99      617   15     59
+#> 1:        -4      B6    641  N806JB    JFK  SFO      343     2586    9      6
+#> 2:        13      EV   5463  N740EV    LGA  BNA      142      764    8     10
+#> 3:       -19      AA   2041  N5EBAA    JFK  MIA      150     1089   12     40
+#> 4:        14      UA    997  N511UA    EWR  LAX      346     2454    8     54
+#> 5:         6      US   2179  N702UW    LGA  DCA       56      214   15      0
+#> 6:         2      EV   4628  N13123    EWR  STL      132      872   13     20
 #>              time_hour
-#> 1: 2013-01-10 21:00:00
-#> 2: 2013-05-13 15:00:00
-#> 3: 2013-05-16 18:00:00
-#> 4: 2013-12-29 20:00:00
-#> 5: 2013-12-21 13:00:00
-#> 6: 2013-12-25 20:00:00
+#> 1: 2013-05-16 13:00:00
+#> 2: 2013-12-30 13:00:00
+#> 3: 2013-02-06 17:00:00
+#> 4: 2013-12-20 13:00:00
+#> 5: 2013-01-15 20:00:00
+#> 6: 2013-02-03 18:00:00

Writing Data diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index 5797c66..2a9aefa 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1

diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index d9e1478..3c7635a 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index 7027aeb..68574b8 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index fe2bcd2..e757449 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -189,14 +189,14 @@

 disk.frame
 system.time(setup_disk.frame()) # ~4s
 #> The number of workers available for disk.frame is 6
 #> user system elapsed
-#> 0.20 0.03 2.32
+#> 0.14 0.06 2.15

We note that there is some time needed for disk.frame to start up all the workers. Next, we try to convert the largest CSV file to disk.frame format. The file to be converted is about 2.2GB in size.

 time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
 
 time_to_convert_disk.frame
 #> elapsed 
-#>    28.3
+#> 27.52

Now that we have converted it, we want to do a count by the first column. To achieve this we use a “two-stage” aggregation strategy. Note that we use keep="V1" to bring only the column V1 into RAM. This avoids reading other unnecessary columns and should speed up the analysis significantly.

 time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
@@ -204,7 +204,7 @@ 
 disk.frame
 time_to_agg_disk.frame
 #> user system elapsed
-#> 0.13 0.03 8.48
+#> 0.15 0.05 7.84
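To unpack the idiom above: the first [.disk.frame call runs once per chunk, so it returns one partial count per group per chunk; the second, plain data.table call merges those partials. A minimal sketch of the same two-stage pattern on a toy disk.frame (the cars data and chunk count here are illustrative, not from the benchmark):

library(disk.frame)
library(data.table)

cars.df = as.disk.frame(cars, nchunks = 2)

# stage 1: runs per chunk; keep = "speed" reads only that column off disk
stage1 = cars.df[, .N, speed, keep = "speed"]

# stage 2: stage1 is an ordinary data.table, so combine the partial counts
stage2 = stage1[, .(N = sum(N)), speed]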

We can inspect the result as well.

 summ
@@ -228,7 +228,7 @@ 
 disk.frame
 summarise(N = n()) %>% collect)
 #> user system elapsed
-#> 1.53 0.17 10.25
+#> 1.45 0.14 6.15

However, the dplyr syntax tends to be slightly slower than using data.table syntax. This may be improved as much of the overhead is due to inefficient use of NSE.
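For reference, the equivalent one-stage dplyr spelling (the code path whose globals handling patches 02 and 05 above rework) hides the two-stage plan behind the usual verbs. A sketch, assuming the same df1 as in the benchmark; the summ_dplyr name is illustrative:

library(dplyr)

summ_dplyr = df1 %>%
  srckeep("V1") %>%        # read only V1 from disk, like keep = "V1"
  group_by(V1) %>%         # disk.frame plans the two stages internally
  summarise(N = n()) %>%
  collect()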

diff --git a/docs/articles/07-glm.html b/docs/articles/07-glm.html index 2b340f3..1580af1 100644 --- a/docs/articles/07-glm.html +++ b/docs/articles/07-glm.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1
diff --git a/docs/articles/08-more-epic.html b/docs/articles/08-more-epic.html index f4170f9..33c55b6 100644 --- a/docs/articles/08-more-epic.html +++ b/docs/articles/08-more-epic.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/09-convenience-features.html b/docs/articles/09-convenience-features.html index ef7bf2e..81fdd50 100644 --- a/docs/articles/09-convenience-features.html +++ b/docs/articles/09-convenience-features.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/10-group-by.html b/docs/articles/10-group-by.html index f18f3b5..7262671 100644 --- a/docs/articles/10-group-by.html +++ b/docs/articles/10-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/11-custom-group-by.html b/docs/articles/11-custom-group-by.html index 695f467..d9f8d61 100644 --- a/docs/articles/11-custom-group-by.html +++ b/docs/articles/11-custom-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/88-trouble-shooting.html b/docs/articles/88-trouble-shooting.html index e1ba2f4..b7c39b6 100644 --- a/docs/articles/88-trouble-shooting.html +++ b/docs/articles/88-trouble-shooting.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/articles/index.html b/docs/articles/index.html index 3e4cba2..e505d22 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/authors.html b/docs/authors.html index 9722619..72268a0 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.6.0, https://diskframe.com. +R package version 0.6.1, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.6.0},
+  note = {R package version 0.6.1},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index 7ea2fc4..a664a0a 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -228,6 +228,7 @@

Quick-start# this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two) setup_disk.frame() +#> The number of workers available for disk.frame is 6 # this allows large datasets to be transferred between sessions options(future.globals.maxSize = Inf) @@ -247,12 +248,12 @@
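A note on the setup above: the worker pool can also be sized explicitly rather than defaulting to one worker per core. A sketch using the workers argument of setup_disk.frame:

library(disk.frame)

# use a fixed pool of 4 background R workers
setup_disk.frame(workers = 4)

# let arbitrarily large globals travel between the session and the workers
options(future.globals.maxSize = Inf)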

dplyr verbsfilter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> 1: 2013 1 1 517 515 2 830 819 -#> 2: 2013 1 1 533 529 4 850 830 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 -#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay +#> 1: 2013 1 1 517 515 2 830 819 11 +#> 2: 2013 1 1 533 529 4 850 830 20 +#> carrier flight tailnum origin dest air_time distance hour minute +#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2: UA 1714 N24211 LGA IAH 227 1416 5 29 #> time_hour origin_dest #> 1: 2013-01-01 05:00:00 EWRIAH #> 2: 2013-01-01 05:00:00 LGAIAH @@ -380,6 +381,7 @@
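Worth noting: these verbs are lazy. Each call is recorded against the disk.frame (this is what create_chunk_mapper in patch 05 rebuilds) and only replayed chunk by chunk when the data is collected. A sketch, reusing flights.df from the quick-start:

lazy = flights.df %>%
  filter(month == 1) %>%
  mutate(speed = distance / air_time)

class(lazy)          # still a disk.frame; nothing has been computed yet
head(collect(lazy))  # the recorded steps are replayed on each chunk here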

Example: data.table syntax.(qtr = ifelse(month <= 3, "Q1", "Q2")) ] ) +#> data.table syntax for disk.frame may be moved to a separate package in the future grp_by_stage1 #> qtr sum_dist @@ -405,24 +407,24 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df"
+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp6HjtTk\\file16141b3219fc.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1   1      517            515         2      830            819
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        11      UA   1545  N14228    EWR  IAH      227     1400    5     15
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     1   1      517            515         2      830            819        11
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      UA   1545  N14228    EWR  IAH      227     1400    5     15
 #>              time_hour
 #> 1: 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     9  30       NA            840        NA       NA           1020
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        NA      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     9  30       NA            840        NA       NA           1020        NA
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
 #>              time_hour
 #> 1: 2013-09-30 08:00:00
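Behind the path shown above, a disk.frame is just a folder holding one fst file per chunk, and chunks can be read individually. A sketch:

# number of fst chunk files backing this disk.frame
nchunks(flights.df)

# read only the first chunk into memory as a data.table
chunk1 = get_chunk(flights.df, 1)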
diff --git a/docs/news/index.html b/docs/news/index.html
index 48d1c26..b12731e 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.6.0
+        0.6.1
       
     
@@ -90,6 +90,12 @@

Changelog

Source: NEWS.md
+disk.frame 0.6.1
+  • Fixed bug with data.table syntax
+  • Auto detection of srckeep in group by (sketched below)
+  • Global detection for group by and summarise
disk.frame 0.6
  • Much better NSE support in disk.frame!
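The srckeep auto-detection sketched here is the mechanism behind the second bullet (see collect.summarized_disk.frame in patch 06): read a single row of the first chunk, then keep only the columns that globals::findGlobals can resolve inside that one-row data mask. A condensed, self-contained sketch; the iris disk.frame and the expression are illustrative:

library(disk.frame)
library(globals)
library(fst)

iris.df = as.disk.frame(iris)
chunk_path = get_chunk_ids(iris.df, full.names = TRUE)[1]

# a one-row read is a cheap way to learn the chunk's column names
one_row = fst::read_fst(chunk_path, from = 1, to = 1)

expr = quote(mean(Petal.Length + val2))   # val2 stands in for a global variable

found = globals::findGlobals(expr, envir = list2env(one_row, parent = globalenv()))
intersect(names(one_row), found)          # expected: "Petal.Length"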
  • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index d6970ad..961402e 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -14,5 +14,5 @@ articles: 10-group-by: 10-group-by.html 11-custom-group-by: 11-custom-group-by.html 88-trouble-shooting: 88-trouble-shooting.html -last_built: 2022-01-30T13:34Z +last_built: 2022-02-01T11:57Z diff --git a/docs/reference/add_chunk.html b/docs/reference/add_chunk.html index 33227b0..93564a5 100644 --- a/docs/reference/add_chunk.html +++ b/docs/reference/add_chunk.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1
@@ -135,12 +135,12 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 @@ -154,12 +154,12 @@

Examples

# you wish to add multiple chunk in parralel add_chunk(df2, data.frame(chunk=1), 1) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk2" #> nchunks: 1 #> nrow (at source): 1 #> ncol (at source): 1 add_chunk(df2, data.frame(chunk=2), 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk2" #> nchunks: 2 #> nrow (at source): 2 #> ncol (at source): 1 diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index aea1586..586cd04 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 1e5146c..aaf5994 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index fa22a8e..780f6ec 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html index 3a33645..26ccb67 100644 --- a/docs/reference/bind_rows.disk.frame.html +++ b/docs/reference/bind_rows.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index f7bbff6..4560d29 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -34,7 +34,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index 1a51879..1d06c61 100644 --- a/docs/reference/cmap.html +++ b/docs/reference/cmap.html @@ -21,7 +21,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -257,7 +257,7 @@

Examples

# return the first row of each chunk eagerly as list cmap(cars.df, ~.x[1,], lazy = FALSE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file4718220342f7.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file126438e21837.df" #> nchunks: 6 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index 336c10b..d1c8aca 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/collect.html b/docs/reference/collect.html index 0d063a4..f3dce80 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 210eada..29bd615 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 5e9a9f2..82d13a1 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 1269b5d..b2df53e 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index f91d899..6291bfe 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 9cf2619..1b43be9 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index c986617..f9a9fb4 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index c9cc7c7..f3e79f4 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index fd1f955..30b2f5a 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -111,7 +111,7 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 143e9c7..fbaf09b 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.0
+        0.6.1
       
     
diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index d88bc80..22f6201 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/find_globals_recursively.html b/docs/reference/find_globals_recursively.html new file mode 100644 index 0000000..57361cf --- /dev/null +++ b/docs/reference/find_globals_recursively.html @@ -0,0 +1,133 @@ + +Find globals in an expression by searching through the chain — find_globals_recursively • disk.frame + + +
+Find globals in an expression by searching through the chain
+find_globals_recursively(code, envir)
+Arguments
+code: An expression to search for globals
+envir: The environment from which to begin the search
+Site built with pkgdown 2.0.2.
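The generated page above carries no example, so here is a sketch of what the helper does: it merges the globals that future::getGlobalsAndPackages finds while climbing the environment chain. It is not exported (no @export in the roxygen block), hence the ::: below, which is an assumption worth flagging:

library(disk.frame)

scale_factor = 10    # a global variable referenced by the expression

f = function() {
  offset = 2         # a local in the frame where the search starts
  code = quote(offset + scale_factor)
  disk.frame:::find_globals_recursively(code, environment())
}

names(f()$globals)   # expected to list both offset and scale_factor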
+ + + + + + + + diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index 6fbc745..f968053 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 8855f4c..642d489 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index fffdd1c..4dc6f17 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index fc6d82d..d1ea2e3 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -121,12 +121,12 @@

Examples

# return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index 7149e00..2ad41d2 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 89dc5d9..30384b4 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 01e0493..521dfe0 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/index.html b/docs/reference/index.html index 7447de7..18dd32f 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -176,6 +176,10 @@

All functions evalparseglue()

Helper function to evalparse some `glue::glue` string

+ +

find_globals_recursively()

+ +

Find globals in an expression by searching through the chain

foverlaps.disk.frame()

diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index d32d003..3caaa6e 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/join.html b/docs/reference/join.html index 87b1e85..2ce46ce 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 63de7ae..eef2b62 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index f93a7f5..0d3036b 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index d4211c0..cff2ae4 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 9321954..fc2f3ae 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 5b5d239..04baa3e 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index ac3c5ab..b16d4eb 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index a33eaeb..5bc041f 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/play.html b/docs/reference/play.html index 286b185..6a06806 100644 --- a/docs/reference/play.html +++ b/docs/reference/play.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index 76b587e..2c5f1bd 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index 172cb91..c93de3c 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html index 4e334c1..ab7b96e 100644 --- a/docs/reference/purrr_as_mapper.html +++ b/docs/reference/purrr_as_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index b50ff95..af351f4 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index b5f2b78..80eeda6 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 
+ 0.6.1 @@ -126,8 +126,8 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\back_up_tmp_dir471815f02658. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471862cf5821.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\back_up_tmp_dir126429f0440c. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file12641cab3271.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index dd1c227..9514ebf 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 232f00e..0cdd331 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -116,7 +116,7 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 @@ -125,7 +125,7 @@

Examples

# removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 @@ -134,7 +134,7 @@

Examples

# removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 056cef0..a3489f0 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -128,27 +128,27 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist #> 1: 7 4 -#> 2: 4 2 -#> 3: 10 18 -#> 4: 10 34 -#> 5: 11 17 -#> 6: 13 34 +#> 2: 7 22 +#> 3: 9 10 +#> 4: 4 10 +#> 5: 12 20 +#> 6: 11 28 #> 7: 13 26 -#> 8: 12 28 +#> 8: 13 34 #> 9: 15 26 -#> 10: 13 46 -#> 11: 16 32 -#> 12: 15 20 -#> 13: 18 42 -#> 14: 17 50 -#> 15: 19 36 +#> 10: 15 20 +#> 11: 13 46 +#> 12: 16 32 +#> 13: 18 84 +#> 14: 16 40 +#> 15: 18 56 #> 16: 18 76 #> 17: 20 56 -#> 18: 20 64 -#> 19: 23 54 +#> 18: 19 46 +#> 19: 19 68 #> 20: 20 52 -#> 21: 24 93 -#> 22: 25 85 +#> 21: 24 70 +#> 22: 24 120 #> speed dist # clean up cars.df diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 6d502ba..6f180be 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 7b59897..ed2ac68 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index 2fce89e..1af8a25 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index dbc9bcc..30e16fb 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index d1d69c5..cfc32e8 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 8bfaee9..8643052 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 3c758a2..51bb55f 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index 1df5aea..1c23b6e 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 6eda846..4365dff 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.6.1 diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index fe4335d..4e1388d 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.6.1 @@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) -#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471855d8254c.zip' cannot be opened +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file126473c130c2.zip' cannot be opened # there is only one csv file so it return a list of one disk.frame zip.df[[1]] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 5df9216..cedcfdd 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -171,6 +171,9 @@ /reference/evalparseglue.html + + /reference/find_globals_recursively.html + /reference/foverlaps.disk.frame.html diff --git a/man/find_globals_recursively.Rd b/man/find_globals_recursively.Rd index bfca17b..2d7bc7f 100644 --- a/man/find_globals_recursively.Rd +++ b/man/find_globals_recursively.Rd @@ -6,6 +6,11 @@ \usage{ find_globals_recursively(code, envir) } +\arguments{ +\item{code}{An expression to search for globals} + +\item{envir}{The environment from which to begin the search} +} \description{ Find globals in an expression by searching through the chain } diff --git a/utils/build_utils.R b/utils/build_utils.R index 33103cf..3c008d9 100644 --- a/utils/build_utils.R +++ b/utils/build_utils.R @@ -205,6 +205,12 @@ df_release <- function() { devtools::release() } +df_ultimate <- function() { + df_check() + df_release() + df_build_site() +} + if(F) { df_check() } From 837a1ff434e6c368cb37731a038bcc5d4a4bea59 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Wed, 2 Feb 2022 21:52:34 +1100 Subject: [PATCH 09/16] updated code --- DESCRIPTION | 4 +- NAMESPACE | 1 + R/collect.r | 56 +++++++++++++++++++++++--- R/collect.summarized_disk.frame.r | 7 ---- R/dplyr_verbs.r | 11 +++++ R/get_chunk.r | 39 +++++++++++++----- R/get_chunk_ids.r | 20 ++++----- R/get_partition.r | 67 +++++++++++++++++++++++++++++++ R/is_disk.frame.r | 12 ++++-- R/names.r | 12 +++--- R/nchunks.r | 2 +- R/ncol-nrow.r | 2 +- R/overwrite_check.r | 3 +- R/partition-filter.r | 13 ++++++ R/recommend_nchunks.r | 1 - R/write_disk.frame.r | 59 ++++++++++++++++++++++++--- man/get_chunk.Rd | 2 +- 17 files changed, 255 insertions(+), 56 deletions(-) create mode 100644 R/get_partition.r create mode 100644 R/partition-filter.r diff --git a/DESCRIPTION b/DESCRIPTION index 32cc40f..9ec2a53 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.6.1 +Version: 0.6.999.999 Date: 2022-02-01 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), @@ -33,7 +33,7 @@ Imports: globals, rlang Depends: - R (>= 3.4), + R (>= 4.0), dplyr (>= 1.0.0) Suggests: nycflights13, diff --git a/NAMESPACE b/NAMESPACE index 08ceb7a..db105be 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -113,6 +113,7 @@ export(nchunks) export(ncol) export(nrow) export(overwrite_check) +export(partition_filter) export(quantile_df.chunk_agg.disk.frame) export(quantile_df.collected_agg.disk.frame) export(rbindlist.disk.frame) diff --git a/R/collect.r b/R/collect.r index 5181e9e..9834758 100644 --- a/R/collect.r +++ b/R/collect.r @@ -25,15 +25,61 @@ #' @rdname collect collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings"))) { cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE) + # obtain filters from structure + partitioned_paths_or_not = get_partition_paths(x) + + if (partitioned_paths_or_not$is_partitioned) 
{
+    partitioned_paths = partitioned_paths_or_not$paths
+    if(length(partitioned_paths) >= 1) {
+      # filter the cids based on the paths
+      tmp = data.frame(paths = cids) %>%
+        mutate(dirname = dirname(paths)) %>%
+        inner_join(data.frame(dirname = sapply(partitioned_paths, tools::file_path_as_absolute)), by = "dirname")
+      cids = tmp$paths
+    }
+  }
+
   if(nchunks(x) > 0) {
     if(parallel) {
-      tmp<-future.apply::future_lapply(cids, function(.x) {
-
-        get_chunk.disk.frame(x, .x, full.names = TRUE)
-      }, future.seed = TRUE)
+      tmp <- future.apply::future_lapply(cids, function(.x) {
+        if(partitioned_paths_or_not$is_partitioned) {
+          dirpath = dirname(.x)
+          tmp2 = partitioned_paths_or_not$df %>%
+            mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path))
+
+          tmp2a = tmp2 %>%
+            filter(fullpath == dirpath) %>%
+            mutate(.check=1)
+
+          stopifnot(nrow(tmp2a) == 1)
+
+          tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a)
+          return(tmp3)
+        } else {
+          return(get_chunk.disk.frame(x, .x, full.names = TRUE))
+        }
+      }, future.seed = NULL)
       return(rbindlist(tmp))
     } else {
-      purrr::map_dfr(cids, ~get_chunk.disk.frame(x, .x, full.names = TRUE))
+      tmp <- lapply(cids, function(.x) {
+        if(partitioned_paths_or_not$is_partitioned) {
+          dirpath = dirname(.x)
+          tmp2 = partitioned_paths_or_not$df %>%
+            mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path))
+
+          tmp2a = tmp2 %>%
+            filter(fullpath == dirpath) %>%
+            mutate(.check=1)
+
+          stopifnot(nrow(tmp2a) == 1)
+
+          tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a)
+          return(tmp3)
+        } else {
+          return(get_chunk.disk.frame(x, .x, full.names = TRUE))
+        }
+      })
+      return(rbindlist(tmp))
     }
   } else {
     data.table()
diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r
index 104ac60..cff94ff 100644
--- a/R/collect.summarized_disk.frame.r
+++ b/R/collect.summarized_disk.frame.r
@@ -133,13 +133,6 @@ collect.summarized_disk.frame <-
 
   summarize_globals_list = attr(x_as.disk.frame, "summarize_globals_and_pkgs")$globals
 
-  if(is.null(summarize_globals_list)) {
-    summ_eval_clos = parent.frame()
-  } else {
-    summ_eval_clos = list2env(summarize_globals_list, parent=parent.frame())
-  }
-
-  #tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = environment(), enclos=summ_eval_clos))
   tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = summarize_globals_list))
 
   second_stage_code = eval(parse(text = sprintf(
diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r
index 285c678..2924324 100644
--- a/R/dplyr_verbs.r
+++ b/R/dplyr_verbs.r
@@ -136,6 +136,17 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup)
 # do not introduce it as it was never introduced
 #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework")
 
+add_count.disk.frame <- function(x, ...) {
+  warning("check if it works")
+  counts = x %>%
+    group_by(...) %>%
+    summarize(n=n()) %>%
+    collect
+
+  left_join.disk.frame(x, counts)
+}
+
+
 #' @export
 #' @rdname dplyr_verbs
diff --git a/R/get_chunk.r b/R/get_chunk.r
index 9ed53f9..01700e0 100644
--- a/R/get_chunk.r
+++ b/R/get_chunk.r
@@ -24,7 +24,7 @@ get_chunk <- function(...) {
 #' @rdname get_chunk
 #' @importFrom fst read_fst
 #' @export
-get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...)
{ +get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info=NULL) { stopifnot("disk.frame" %in% class(df)) # keep_chunks = attr(df, "keep_chunks", exact=TRUE) @@ -44,9 +44,10 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { keep1_vars = paste0(keep1, collapse = ", ") keep_no_good_vars = setdiff(keep, keep1) %>% paste0(collapse = ", ") stop( - glue::glue( - "This disk.frame has a srckeep containing these variables {keep1_vars}. - You are trying to keep {keep_no_good_vars}, which are not available.")) + sprintf( + "This disk.frame has a `srckeep` containing these columns: `%s`. + You are trying to keep `%s`, which are not available.", + paste0(keep1_vars, collapse=", "), paste0(keep_no_good_vars, collapse=", "))) } keep = intersect(keep1, keep) if (!all(keep %in% keep1)) { @@ -78,21 +79,37 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { return(notbl) } - if (is.null(recordings)) { if(typeof(keep)=="closure") { - fst::read_fst(filename, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, as.data.table = TRUE,...) } else { - fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + } + + if(!is.null(partitioned_info)) { + res = tmp %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(res) + } else{ + return(tmp) } } else { if(typeof(keep)=="closure") { - play(fst::read_fst(filename, as.data.table = TRUE,...), recordings) + tmp_df_input = fst::read_fst(filename, as.data.table = TRUE,...) } else { tmp_df_input = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) - - res = play(tmp_df_input, recordings) - return(res) + } + + if(!is.null(partitioned_info)) { + res = tmp_df_input %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(play(res, recordings)) + } else{ + return(play(tmp_df_input, recordings)) } } } diff --git a/R/get_chunk_ids.r b/R/get_chunk_ids.r index bbb3890..9a023e9 100644 --- a/R/get_chunk_ids.r +++ b/R/get_chunk_ids.r @@ -20,19 +20,19 @@ #' # clean up cars.df #' delete(cars.df) get_chunk_ids <- function(df, ..., full.names = FALSE, strip_extension = TRUE) { - lf = list.files(attr(df,"path"), full.names = full.names, ...) 
+  stopifnot("disk.frame" %in% class(df))
+
+  lf = list.files(attr(df,"path"), full.names = full.names, ..., recursive = TRUE)
   if(full.names) {
     return(lf)
   }
-  purrr::map_chr(lf, ~{
-    tmp = stringr::str_split(.x,stringr::fixed("."), simplify = TRUE)
-    l = length(tmp)
-    if(l == 1) {
-      return(tmp)
-    } else if(strip_extension) {
-      paste0(tmp[-l], collapse="")
-    } else {
-      .x
+
+  # strip out the path or the file extension if required
+  sapply(lf, function(path) {
+    tmp = basename(path)
+    if (strip_extension) {
+      tmp = tools::file_path_sans_ext(tmp)
     }
+    return(tmp)
   })
 }
diff --git a/R/get_partition.r b/R/get_partition.r
new file mode 100644
index 0000000..925ff24
--- /dev/null
+++ b/R/get_partition.r
@@ -0,0 +1,67 @@
+#' Turn a string of the form /partition1=val1/partition2=val2 into a data.frame
+split_string_into_df <- function(path_strs) {
+  paths = dirname(path_strs) %>% unique
+  list_of_partitions = stringr::str_split(paths, "/")
+
+  tmp = mapply(function(partition, path) {
+    part_val = stringr::str_split(partition, "=")
+    tmp = lapply(part_val, function(part_val, lvl) {
+      tmp = data.frame(partition = part_val[2])
+      names(tmp) = part_val[1]
+      tmp
+    }) %>%
+      do.call(cbind, .)
+
+    tmp$.disk.frame.sub.path = path
+
+    tmp
+  }, list_of_partitions, paths, SIMPLIFY = FALSE) %>% data.table::rbindlist()
+
+  tmp
+}
+
+if(F) {
+  df = disk.frame("C:/temp/ok.df") %>%
+    filter(partition == 1)
+}
+
+#' Get the partitioning structure of a folder
+get_partition_paths <- function(df) {
+  stopifnot("disk.frame" %in% class(df))
+  path = tools::file_path_as_absolute(attr(df, "path"))
+
+  allowed_paths = path
+
+  meta_path = file.path(path, ".metadata")
+
+  is_partitioned = FALSE
+  df_of_partitions = NULL
+
+  # if it's a partitioned structure allow more search paths than root
+  if (length(setdiff(list.dirs(path, recursive = FALSE), meta_path)) >= 1) {
+    lf = list.files(path, full.names = FALSE, pattern="fst", recursive=TRUE)
+
+    # create a data.frame of the paths so it can be filtered
+    df_of_partitions = split_string_into_df(lf)
+    # infer the types
+    df_of_partitions = type.convert(df_of_partitions, as.is=TRUE)
+
+    # if there is a filter operation, filter the above to figure out the allowed paths
+    allowed_paths = df_of_partitions$.disk.frame.sub.path
+
+    # filter for some paths if necessary
+    partition_filter_info = attr(df, "partition_filter")
+    if (!is.null(partition_filter_info)) {
+      # apply filter
+      df_of_partitions = eval(partition_filter_info$expr, list(dataframe=df_of_partitions))
+
+      allowed_paths = df_of_partitions$.disk.frame.sub.path
+    }
+    is_partitioned = TRUE
+  }
+
+  # now go through the allowed paths
+  list(paths=file.path(attr(df, "path"), allowed_paths), is_partitioned=is_partitioned, df = df_of_partitions)
+
+  # TODO check all files sit within the same structure
+}
diff --git a/R/is_disk.frame.r b/R/is_disk.frame.r
index 542bd38..48f538d 100644
--- a/R/is_disk.frame.r
+++ b/R/is_disk.frame.r
@@ -10,24 +10,30 @@
 #' # clean up cars.df
 #' delete(cars.df)
 is_disk.frame <- function(df) {
-  ##browser
   if("disk.frame" %in% class(df)) {
     df = attr(df, "path", exact=TRUE)
   } else if(!"character" %in% class(df)) {
     # character then check the path
     return(FALSE)
   }
 
-  files <- fs::dir_ls(df, type="file", all = TRUE)
+  files <- fs::dir_ls(df, type="file", all = TRUE)
 
   # if all files are fst
   if(length(files)>0) {
     if(any(purrr::map_lgl(files, ~length(grep(glob2rx("*.fst"), .x)) == 0))) {
-      # some of the fiels do not have a .fst extension
+      # some of the files do not have a .fst extension
       return(FALSE)
     }
   }
 
   dirs = fs::dir_ls(df, type="directory", all = TRUE)
   if(length(dirs) > 1) {
+    # are the directories of the form name=val?
+    split_dirs = stringr::str_split(basename(setdiff(dirs, file.path(df, ".metadata"))), "=")
+    if (all(sapply(split_dirs, length) == 2)) {
+      # all folders are of the form name=val, so treat this as a partitioned disk.frame
+      # TODO check the folders recursively
+      return(TRUE)
+    }
     return(FALSE)
   } else if(length(dirs) == 1) {
     if(substr(dirs, nchar(dirs)-8,nchar(dirs)) != ".metadata") {
diff --git a/R/names.r b/R/names.r
index 05d047d..9f4393a 100644
--- a/R/names.r
+++ b/R/names.r
@@ -25,17 +25,17 @@ names.disk.frame <- function(x, ...) {
 #' @export
 colnames.disk.frame <- function(x, ...) {
   res = attr(x, "path", exact=TRUE) %>%
-    list.files(full.names = TRUE)
+    list.files(full.names = TRUE, recursive=TRUE, pattern = "fst")
 
-  if(is.null(attr(x, "recordings"))) {
+  # if(is.null(attr(x, "recordings"))) {
     if(length(res) == 0) {
       return(vector("character"))
     }
     return(fst::metadata_fst(res[1])$columnNames)
-  } else {
-    tiny_example_data.frame = get_chunk(x, 1, from=1, to=1)
-    return(colnames(tiny_example_data.frame))
-  }
+  # } else {
+  #   tiny_example_data.frame = get_chunk(x, 1, from=1, to=1)
+  #   return(colnames(tiny_example_data.frame))
+  # }
 }
 
diff --git a/R/nchunks.r b/R/nchunks.r
index d40eb6e..17c8988 100644
--- a/R/nchunks.r
+++ b/R/nchunks.r
@@ -36,7 +36,7 @@ nchunks.disk.frame <- function(df, skip.ready.check = FALSE, ...) {
   #if(!skip.ready.check) stopifnot(is_ready(df))
   fpath <- attr(df,"path", exact=TRUE)
   if(is.dir.disk.frame(df)) {
-    return(length(fs::dir_ls(fpath, type="file")))
+    return(length(list.files(fpath, pattern="fst", recursive = TRUE)))
   } else {
     return(1)
   }
diff --git a/R/ncol-nrow.r b/R/ncol-nrow.r
index 1addc2d..3f9e17c 100644
--- a/R/ncol-nrow.r
+++ b/R/ncol-nrow.r
@@ -29,7 +29,7 @@ nrow.disk.frame <- function(df, ...) {
   stopifnot(is_ready(df))
   path1 <- attr(df,"path", exact=TRUE)
   if(dir.exists(path1)) {
-    path2 <- list.files(path1,full.names = TRUE)
+    path2 <- list.files(path1, full.names = TRUE, recursive=TRUE, pattern = "fst")
     if(length(path2) == 0) {
       return(0)
     }
diff --git a/R/overwrite_check.r b/R/overwrite_check.r
index f065131..eabcf92 100644
--- a/R/overwrite_check.r
+++ b/R/overwrite_check.r
@@ -15,7 +15,6 @@
 #' # clean up
 #' fs::dir_delete(tf)
 overwrite_check <- function(outdir, overwrite) {
-  ##browser
   if (is.null(outdir)) {
     warning("outdir is NULL; no overwrite check is performed")
     return(NULL)
@@ -30,7 +29,7 @@
       fs::dir_delete(outdir)
     }, error = function(e) {
       message(e)
-      stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reason and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems"))
+      stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reasons and may be a genuine bug. Firstly, though, please ensure you do not have the folder open in Explorer (Windows) or another file manager"))
     })
   }
 
diff --git a/R/partition-filter.r b/R/partition-filter.r
new file mode 100644
index 0000000..6d754f2
--- /dev/null
+++ b/R/partition-filter.r
@@ -0,0 +1,13 @@
+#' Filter the dataset based on folder partitions
+#' @param x a disk.frame
+#' @param ... filtering conditions for filtering the disk.frame at (folder) partition level
+#' @importFrom dplyr filter
+#' @export
+partition_filter <- function(x, ...)
{ + expr = bquote(dplyr::filter(dataframe, .(substitute(...)))) + globals = find_globals_recursively(expr, parent.frame()) + + attr(x, "partition_filter") = list(expr=expr, globals=globals) + + return(x) +} diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index b8cd560..bb59aec 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -23,7 +23,6 @@ #' # recommend nchunks based on file size ONLY CSV is implemented at the moment #' recommend_nchunks(1024^3) recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthreads(), conservatism = 8, ram_size = df_ram_size()) { - dfsize = 0 if ("data.frame" %in% class(df)) { # the df's size in gigabytes diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r index cdfd0c3..96a8784 100644 --- a/R/write_disk.frame.r +++ b/R/write_disk.frame.r @@ -7,10 +7,8 @@ #' @param nchunks number of chunks #' @param overwrite overwrite output directory #' @param shardby the columns to shard by +#' @param partitionby the columns to (folder) partition by #' @param compress compression ratio for fst files -#' @param shardby_function splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits for the "sort" shardby function, a dataframe with the split values. -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. #' @param ... passed to cmap.disk.frame #' @export #' @import fst fs @@ -33,18 +31,67 @@ write_disk.frame <- function( nchunks.disk.frame(diskf), recommend_nchunks(diskf)), overwrite = FALSE, - shardby=NULL, compress = 50, shardby_function="hash", sort_splits=NULL, desc_vars=NULL, ...) { + shardby=NULL, + partitionby=NULL, + compress = 50, ...) { force(nchunks) overwrite_check(outdir, overwrite) - if(is.null(outdir)) { stop("write_disk.frame error: outdir must not be NULL") } if(is_disk.frame(diskf)) { - if(is.null(shardby)) { + if(!is.null(partitionby)) { + + # for each chunk group by the partionby and then write out a partitioned disk.frame for each chunk + list_of_paths = diskf %>% + cimap(~{ + tmp_dir_to_write = tempfile(as.character(.y)) + tmp = .x %>% + group_by(!!!syms(partitionby)) %>% + group_map(~{ + # convert group keys to path + tmp_path = lapply(names(.y), function(n) { + sprintf("%s=%s", n, .y[, n]) + }) %>% + do.call(file.path, .) 
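# A minimal sketch of the path construction above, assuming a group-key
# data.frame `.y` with two hypothetical partition columns, speed = 10 and
# gear = 4; each name/value pair becomes one "name=value" path segment:
sprintf("%s=%s", "speed", 10)      # "speed=10"
file.path("speed=10", "gear=4")    # "speed=10/gear=4"
# so the chunk is written under <outdir>/speed=10/gear=4/, the hive-style
# layout that split_string_into_df() later parses back into columns.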
+ + final_tmp_path = file.path(tmp_dir_to_write, tmp_path) + as.disk.frame(.x, final_tmp_path, overwrite = FALSE) + }) + return(tmp_dir_to_write) + }, lazy=FALSE) + + # for each of the chunks, do a soft row-append + partitioned_files = lapply(list_of_paths, function(path) { + # each path is a partitioned disk.frame + files = list.files(path, full.names = TRUE, recursive=TRUE) + tmp = data.frame(partition_path = files %>% + dirname %>% + sapply(tools::file_path_as_absolute) %>% + stringr::str_sub(nchar(path)+2)) + tmp = tmp %>% mutate(path=path, files=files) + + tmp + }) %>% rbindlist + + partitioned_files %>% + group_by(partition_path) %>% + group_map(function(df, grp) { + mapply(function(file, i) { + outfile = file.path(outdir, grp$partition_path, paste0(i, ".fst")) + if(!dir.exists(file.path(outdir, grp$partition_path))) { + fs::dir_create(file.path(outdir, grp$partition_path)) + } + fs::file_move(file, outfile) + }, df$files, seq_along(df$files)) + }) + + return(disk.frame(outdir)) + + } else if(is.null(shardby)) { path = attr(diskf, "path") files_shortname <- list.files(path) cids = get_chunk_ids(diskf, full.names = T, strip_extension = F) diff --git a/man/get_chunk.Rd b/man/get_chunk.Rd index a4b4b79..8e144c6 100644 --- a/man/get_chunk.Rd +++ b/man/get_chunk.Rd @@ -7,7 +7,7 @@ \usage{ get_chunk(...) -\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ...) +\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL) } \arguments{ \item{...}{passed to fst::read_fst or whichever read function is used in the backend} From 228b4ab530f4243d704a13dbafa176dfce0d7a05 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Wed, 2 Feb 2022 21:53:19 +1100 Subject: [PATCH 10/16] moved docs around to other repos --- man/get_partition_paths.Rd | 11 + man/partition_filter.Rd | 16 ++ man/split_string_into_df.Rd | 11 + man/write_disk.frame.Rd | 12 +- manuscript/Book.txt | 2 - .../taming-larger-than-ram-data-course.txt | 2 - presentation/.gitignore | 2 - .../rstudio conf 2021/1min video script | 11 - .../rstudio conf 2021/Abstract proposal.md | 5 - presentation/twin cities/.gitignore | 1 - .../user 2020/user-2020-tut-proposal.Rmd | 103 --------- tests/testthat/test-names.r | 14 +- tests/testthat/test-partitions.r | 24 +++ tutorials/0_feature_tools_drake.r | 20 -- tutorials/1_feature_tools.r | 16 -- tutorials/2_feature_tools.r | 67 ------ tutorials/2_feature_tools_drake.r | 63 ------ tutorials/2_feature_tools_drake.rmd | 60 ------ tutorials/2_feature_tools_no_plo.r | 52 ----- tutorials/30-g.R | 49 ----- tutorials/bigreadr.R | 59 ----- tutorials/cran-logs.r | 78 ------- tutorials/fannie-mae-to-parquet/0-setup.r | 15 -- tutorials/fannie-mae-to-parquet/1-testing.r | 7 - tutorials/fannie_mae/00_setup.r | 66 ------ tutorials/fannie_mae/01a_read_from_csv.r | 40 ---- .../01a_read_from_csv_SINGLE_THREADED.r | 37 ---- tutorials/fannie_mae/01d_OPTIONAL_rechunk.r | 5 - tutorials/fannie_mae/01e_read_data_for_harp.r | 85 -------- .../01f_append_harp_to_data_DEPRECATED.r | 40 ---- .../fannie_mae/01g_compress_it_DEPRECATED.r | 10 - tutorials/fannie_mae/02_exploratory.r | 19 -- .../02a_create_forward_looking_flag.r | 97 --------- tutorials/fannie_mae/02b_foverlaps.r | 45 ---- .../fannie_mae/02b_foverlaps_Reduced16.r | 68 ------ tutorials/fannie_mae/02c_plot_odr.r | 45 ---- tutorials/fannie_mae/02d_harp_check.r | 33 --- tutorials/fannie_mae/10_a_one_var.R | 191 ----------------- tutorials/fannie_mae/10_a_one_var_with_fn.R | 56 ----- tutorials/fannie_mae/10_b_two_vars.r | 
61 ------ .../10a_AutoML_XGBoost_Scorecards.r | 90 -------- .../fannie_mae/10b_make_scorecard_visible.R | 109 ---------- tutorials/fannie_mae/11_speedglm.R | 50 ----- tutorials/fannie_mae/12_keras.r | 178 ---------------- tutorials/fannie_mae/13_disk.r | 25 --- tutorials/fannie_mae/3a_get_some_data.r | 113 ---------- tutorials/fannie_mae/4a_read_appl.r | 57 ----- tutorials/fannie_mae/4b_exploratory.R | 92 -------- tutorials/fannie_mae/4c_explore_xgboost.r | 44 ---- tutorials/fannie_mae/4d_xgboost.r | 151 ------------- tutorials/fannie_mae/5a_appl_model.r | 53 ----- tutorials/fannie_mae/5b_one_var.r | 189 ---------------- tutorials/fannie_mae/5c_using_a_fn.r | 54 ----- tutorials/fannie_mae/5d_AutoML.r | 34 --- tutorials/fannie_mae/5e_AutoML_actual.r | 90 -------- .../fannie_mae/5f_make_scorecard_visible.R | 94 -------- tutorials/fannie_mae/6a_origination_year.r | 31 --- tutorials/fannie_mae/7a_speedglm.r | 75 ------- tutorials/fannie_mae/8a_keras.r | 153 ------------- tutorials/fannie_mae_10pct/00_setup.r | 44 ---- .../fannie_mae_10pct/01a_read_from_csv.r | 47 ---- .../01d_a_rbind_all_data_together.r | 31 --- .../fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r | 16 -- .../02a_create_forward_looking_flag.r | 65 ------ ..._forward_looking_flag_4credit_risk_forum.r | 47 ---- tutorials/fannie_mae_10pct/02b_foverlaps.r | 29 --- tutorials/fannie_mae_10pct/02c_plot_odr.r | 34 --- tutorials/fannie_mae_10pct/10_a_one_var.R | 55 ----- .../fannie_mae_10pct/10_a_one_var_with_fn.R | 56 ----- tutorials/fannie_mae_10pct/10_b_rounded.R | 123 ----------- tutorials/fannie_mae_10pct/10_b_two_vars.r | 61 ------ tutorials/fannie_mae_10pct/10_c_two_vars.R | 64 ------ .../10_e_AutoML_XGBoost_Scorecards.r | 91 -------- .../10_f_make_scorecard_visible.R | 109 ---------- .../10a_AutoML_XGBoost_Scorecards.r | 90 -------- .../10b_make_scorecard_visible.R | 109 ---------- tutorials/fannie_mae_10pct/11_speedglm.R | 50 ----- tutorials/fannie_mae_10pct/12_keras.r | 178 ---------------- tutorials/fannie_mae_new/00_setup.r | 66 ------ tutorials/fannie_mae_new/01a_read_from_csv.r | 40 ---- .../fannie_mae_new/01d_OPTIONAL_rechunk.r | 5 - .../fannie_mae_new/01e_read_data_for_harp.r | 85 -------- tutorials/fannie_mae_new/02_exploratory.r | 19 -- .../02a_create_forward_looking_flag.r | 97 --------- tutorials/fannie_mae_new/02b_foverlaps.r | 45 ---- .../fannie_mae_new/02b_foverlaps_Reduced16.r | 68 ------ tutorials/fannie_mae_new/02c_plot_odr.r | 45 ---- tutorials/fannie_mae_new/02d_harp_check.r | 33 --- tutorials/fannie_mae_new/10_a_one_var.R | 191 ----------------- .../fannie_mae_new/10_a_one_var_with_fn.R | 56 ----- tutorials/fannie_mae_new/10_b_two_vars.r | 61 ------ .../10a_AutoML_XGBoost_Scorecards.r | 90 -------- .../10b_make_scorecard_visible.R | 109 ---------- tutorials/fannie_mae_new/11_speedglm.R | 50 ----- tutorials/fannie_mae_new/12_keras.r | 178 ---------------- tutorials/fannie_mae_new/13_disk.r | 25 --- tutorials/fannie_mae_new/3a_get_some_data.r | 113 ---------- tutorials/fannie_mae_new/4a_read_appl.r | 59 ----- tutorials/fannie_mae_new/4b_exploratory.R | 92 -------- tutorials/fannie_mae_new/4c_explore_xgboost.r | 44 ---- tutorials/fannie_mae_new/4d_xgboost.r | 151 ------------- tutorials/fannie_mae_new/5a_appl_model.r | 53 ----- tutorials/fannie_mae_new/5b_one_var.r | 189 ---------------- tutorials/fannie_mae_new/5c_using_a_fn.r | 54 ----- tutorials/fannie_mae_new/5d_AutoML.r | 34 --- tutorials/fannie_mae_new/5e_AutoML_actual.r | 90 -------- .../5f_make_scorecard_visible.R | 94 -------- 
.../fannie_mae_new/6a_origination_year.r | 31 --- tutorials/fannie_mae_new/7a_speedglm.r | 75 ------- tutorials/fannie_mae_new/8a_keras.r | 153 ------------- tutorials/flights-100-1000.r | 49 ----- .../flights_case_study/0-download-data.sh | 7 - .../flights_case_study/1-make-disk-frame.r | 39 ---- tutorials/flights_case_study/2-shard-check.r | 46 ---- tutorials/flights_case_study/2-shard-check2.r | 31 --- tutorials/flights_case_study/testing.r | 67 ------ tutorials/flights_case_study/testing2.r | 31 --- tutorials/nyc-taxi/nyc-taxi.Rmd | 70 ------ tutorials/readme.r | 94 -------- tutorials/surf_2019_02_demo/.gitignore | 5 - tutorials/surf_2019_02_demo/00_setup.r | 41 ---- .../surf_2019_02_demo/01_surf_201902.rmd | 131 ------------ .../surf_2019_02_demo/10_d_Fannie_mae_1_8b.r | 6 - .../10_d_Fannie_mae_1_8b.rmd | 54 ----- .../10_e_AutoML_XGBoost_Scorecards.r | 112 ---------- .../10_f_make_scorecard_visible.R | 118 ---------- tutorials/surf_2019_02_demo/11_speedglm.R | 51 ----- tutorials/surf_2019_02_demo/12_keras.r | 180 ---------------- .../13_logistic_regression.r | 201 ------------------ .../surf_2019_02_demo/surf_2019_02_demo.ipynb | 68 ------ tutorials/tutorial.r | 124 ----------- .../useR! 2020 Tutorial part 1/.gitignore | 1 - .../useR! 2020 Tutorial part 1.Rmd | 147 ------------- tutorials/vs-vaex/ok.py | 5 - 134 files changed, 72 insertions(+), 8680 deletions(-) create mode 100644 man/get_partition_paths.Rd create mode 100644 man/partition_filter.Rd create mode 100644 man/split_string_into_df.Rd delete mode 100644 manuscript/Book.txt delete mode 100644 manuscript/taming-larger-than-ram-data-course.txt delete mode 100644 presentation/.gitignore delete mode 100644 presentation/rstudio conf 2021/1min video script delete mode 100644 presentation/rstudio conf 2021/Abstract proposal.md delete mode 100644 presentation/twin cities/.gitignore delete mode 100644 presentation/user 2020/user-2020-tut-proposal.Rmd create mode 100644 tests/testthat/test-partitions.r delete mode 100644 tutorials/0_feature_tools_drake.r delete mode 100644 tutorials/1_feature_tools.r delete mode 100644 tutorials/2_feature_tools.r delete mode 100644 tutorials/2_feature_tools_drake.r delete mode 100644 tutorials/2_feature_tools_drake.rmd delete mode 100644 tutorials/2_feature_tools_no_plo.r delete mode 100644 tutorials/30-g.R delete mode 100644 tutorials/bigreadr.R delete mode 100644 tutorials/cran-logs.r delete mode 100644 tutorials/fannie-mae-to-parquet/0-setup.r delete mode 100644 tutorials/fannie-mae-to-parquet/1-testing.r delete mode 100644 tutorials/fannie_mae/00_setup.r delete mode 100644 tutorials/fannie_mae/01a_read_from_csv.r delete mode 100644 tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r delete mode 100644 tutorials/fannie_mae/01d_OPTIONAL_rechunk.r delete mode 100644 tutorials/fannie_mae/01e_read_data_for_harp.r delete mode 100644 tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r delete mode 100644 tutorials/fannie_mae/01g_compress_it_DEPRECATED.r delete mode 100644 tutorials/fannie_mae/02_exploratory.r delete mode 100644 tutorials/fannie_mae/02a_create_forward_looking_flag.r delete mode 100644 tutorials/fannie_mae/02b_foverlaps.r delete mode 100644 tutorials/fannie_mae/02b_foverlaps_Reduced16.r delete mode 100644 tutorials/fannie_mae/02c_plot_odr.r delete mode 100644 tutorials/fannie_mae/02d_harp_check.r delete mode 100644 tutorials/fannie_mae/10_a_one_var.R delete mode 100644 tutorials/fannie_mae/10_a_one_var_with_fn.R delete mode 100644 tutorials/fannie_mae/10_b_two_vars.r delete mode 
100644 tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r delete mode 100644 tutorials/fannie_mae/10b_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae/11_speedglm.R delete mode 100644 tutorials/fannie_mae/12_keras.r delete mode 100644 tutorials/fannie_mae/13_disk.r delete mode 100644 tutorials/fannie_mae/3a_get_some_data.r delete mode 100644 tutorials/fannie_mae/4a_read_appl.r delete mode 100644 tutorials/fannie_mae/4b_exploratory.R delete mode 100644 tutorials/fannie_mae/4c_explore_xgboost.r delete mode 100644 tutorials/fannie_mae/4d_xgboost.r delete mode 100644 tutorials/fannie_mae/5a_appl_model.r delete mode 100644 tutorials/fannie_mae/5b_one_var.r delete mode 100644 tutorials/fannie_mae/5c_using_a_fn.r delete mode 100644 tutorials/fannie_mae/5d_AutoML.r delete mode 100644 tutorials/fannie_mae/5e_AutoML_actual.r delete mode 100644 tutorials/fannie_mae/5f_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae/6a_origination_year.r delete mode 100644 tutorials/fannie_mae/7a_speedglm.r delete mode 100644 tutorials/fannie_mae/8a_keras.r delete mode 100644 tutorials/fannie_mae_10pct/00_setup.r delete mode 100644 tutorials/fannie_mae_10pct/01a_read_from_csv.r delete mode 100644 tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r delete mode 100644 tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r delete mode 100644 tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r delete mode 100644 tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r delete mode 100644 tutorials/fannie_mae_10pct/02b_foverlaps.r delete mode 100644 tutorials/fannie_mae_10pct/02c_plot_odr.r delete mode 100644 tutorials/fannie_mae_10pct/10_a_one_var.R delete mode 100644 tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R delete mode 100644 tutorials/fannie_mae_10pct/10_b_rounded.R delete mode 100644 tutorials/fannie_mae_10pct/10_b_two_vars.r delete mode 100644 tutorials/fannie_mae_10pct/10_c_two_vars.R delete mode 100644 tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r delete mode 100644 tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r delete mode 100644 tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae_10pct/11_speedglm.R delete mode 100644 tutorials/fannie_mae_10pct/12_keras.r delete mode 100644 tutorials/fannie_mae_new/00_setup.r delete mode 100644 tutorials/fannie_mae_new/01a_read_from_csv.r delete mode 100644 tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r delete mode 100644 tutorials/fannie_mae_new/01e_read_data_for_harp.r delete mode 100644 tutorials/fannie_mae_new/02_exploratory.r delete mode 100644 tutorials/fannie_mae_new/02a_create_forward_looking_flag.r delete mode 100644 tutorials/fannie_mae_new/02b_foverlaps.r delete mode 100644 tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r delete mode 100644 tutorials/fannie_mae_new/02c_plot_odr.r delete mode 100644 tutorials/fannie_mae_new/02d_harp_check.r delete mode 100644 tutorials/fannie_mae_new/10_a_one_var.R delete mode 100644 tutorials/fannie_mae_new/10_a_one_var_with_fn.R delete mode 100644 tutorials/fannie_mae_new/10_b_two_vars.r delete mode 100644 tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r delete mode 100644 tutorials/fannie_mae_new/10b_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae_new/11_speedglm.R delete mode 100644 tutorials/fannie_mae_new/12_keras.r delete mode 100644 tutorials/fannie_mae_new/13_disk.r delete mode 
100644 tutorials/fannie_mae_new/3a_get_some_data.r delete mode 100644 tutorials/fannie_mae_new/4a_read_appl.r delete mode 100644 tutorials/fannie_mae_new/4b_exploratory.R delete mode 100644 tutorials/fannie_mae_new/4c_explore_xgboost.r delete mode 100644 tutorials/fannie_mae_new/4d_xgboost.r delete mode 100644 tutorials/fannie_mae_new/5a_appl_model.r delete mode 100644 tutorials/fannie_mae_new/5b_one_var.r delete mode 100644 tutorials/fannie_mae_new/5c_using_a_fn.r delete mode 100644 tutorials/fannie_mae_new/5d_AutoML.r delete mode 100644 tutorials/fannie_mae_new/5e_AutoML_actual.r delete mode 100644 tutorials/fannie_mae_new/5f_make_scorecard_visible.R delete mode 100644 tutorials/fannie_mae_new/6a_origination_year.r delete mode 100644 tutorials/fannie_mae_new/7a_speedglm.r delete mode 100644 tutorials/fannie_mae_new/8a_keras.r delete mode 100644 tutorials/flights-100-1000.r delete mode 100644 tutorials/flights_case_study/0-download-data.sh delete mode 100644 tutorials/flights_case_study/1-make-disk-frame.r delete mode 100644 tutorials/flights_case_study/2-shard-check.r delete mode 100644 tutorials/flights_case_study/2-shard-check2.r delete mode 100644 tutorials/flights_case_study/testing.r delete mode 100644 tutorials/flights_case_study/testing2.r delete mode 100644 tutorials/nyc-taxi/nyc-taxi.Rmd delete mode 100644 tutorials/readme.r delete mode 100644 tutorials/surf_2019_02_demo/.gitignore delete mode 100644 tutorials/surf_2019_02_demo/00_setup.r delete mode 100644 tutorials/surf_2019_02_demo/01_surf_201902.rmd delete mode 100644 tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r delete mode 100644 tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd delete mode 100644 tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r delete mode 100644 tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R delete mode 100644 tutorials/surf_2019_02_demo/11_speedglm.R delete mode 100644 tutorials/surf_2019_02_demo/12_keras.r delete mode 100644 tutorials/surf_2019_02_demo/13_logistic_regression.r delete mode 100644 tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb delete mode 100644 tutorials/tutorial.r delete mode 100644 tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore delete mode 100644 tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd delete mode 100644 tutorials/vs-vaex/ok.py diff --git a/man/get_partition_paths.Rd b/man/get_partition_paths.Rd new file mode 100644 index 0000000..5f230f7 --- /dev/null +++ b/man/get_partition_paths.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_partition.r +\name{get_partition_paths} +\alias{get_partition_paths} +\title{Get the partitioning structure of a folder} +\usage{ +get_partition_paths(df) +} +\description{ +Get the partitioning structure of a folder +} diff --git a/man/partition_filter.Rd b/man/partition_filter.Rd new file mode 100644 index 0000000..14a93a4 --- /dev/null +++ b/man/partition_filter.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/partition-filter.r +\name{partition_filter} +\alias{partition_filter} +\title{Filter the dataset based on folder partitions} +\usage{ +partition_filter(x, ...) 
+}
+\arguments{
+\item{x}{a disk.frame}
+
+\item{...}{filtering conditions for filtering the disk.frame at (folder) partition level}
+}
+\description{
+Filter the dataset based on folder partitions
+}
diff --git a/man/split_string_into_df.Rd b/man/split_string_into_df.Rd
new file mode 100644
index 0000000..345afd6
--- /dev/null
+++ b/man/split_string_into_df.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_partition.r
+\name{split_string_into_df}
+\alias{split_string_into_df}
+\title{Turn a string of the form /partition1=val1/partition2=val2 into a data.frame}
+\usage{
+split_string_into_df(path_strs)
+}
+\description{
+Turn a string of the form /partition1=val1/partition2=val2 into a data.frame
+}
diff --git a/man/write_disk.frame.Rd b/man/write_disk.frame.Rd
index 60c2bf0..c152c85 100644
--- a/man/write_disk.frame.Rd
+++ b/man/write_disk.frame.Rd
@@ -12,10 +12,8 @@ write_disk.frame(
     recommend_nchunks(diskf)),
   overwrite = FALSE,
   shardby = NULL,
+  partitionby = NULL,
   compress = 50,
-  shardby_function = "hash",
-  sort_splits = NULL,
-  desc_vars = NULL,
   ...
 )
 
@@ -32,13 +30,9 @@ output_disk.frame(...)
 
 \item{shardby}{the columns to shard by}
 
-\item{compress}{compression ratio for fst files}
-
-\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks}
+\item{partitionby}{the columns to (folder) partition by}
 
-\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.}
-
-\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.}
+\item{compress}{compression ratio for fst files}
 
 \item{...}{passed to cmap.disk.frame}
 }
diff --git a/manuscript/Book.txt b/manuscript/Book.txt
deleted file mode 100644
index 08609b2..0000000
--- a/manuscript/Book.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-taming-larger-than-ram-data-course.txt
-
diff --git a/manuscript/taming-larger-than-ram-data-course.txt b/manuscript/taming-larger-than-ram-data-course.txt
deleted file mode 100644
index 864097c..0000000
--- a/manuscript/taming-larger-than-ram-data-course.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-# Introduction
-This course will teach you all the ins and outs of {disk.frame}. The instructor is the author of {disk.frame}. Enjoy!
diff --git a/presentation/.gitignore b/presentation/.gitignore
deleted file mode 100644
index 0586a46..0000000
--- a/presentation/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.pptx
-*.pdf
diff --git a/presentation/rstudio conf 2021/1min video script b/presentation/rstudio conf 2021/1min video script
deleted file mode 100644
index 5333716..0000000
--- a/presentation/rstudio conf 2021/1min video script
+++ /dev/null
@@ -1,11 +0,0 @@
-Hi my name is ZedJ and I am a Data Scientist local to Melbourne. I am a keen contributor to open source data science projects, one of which I want to talk about at rstudio:conf 2021. That project is {disk.frame} - a larger-than-RAM data manipulation package.
-
-R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out.
-
-{disk.frame} solves this issue by providing a 100%-R framework to manipulate data on disk. A modern laptop with {disk.frame} can comfortably handle 100GB's of data.
-
-Also, {disk.frame} uses {dplyr} verbs to manipulate data so useRs will find it very easy to pick up.
-
-Finally, because {disk.frame} is 100%-R, you can use any R package with it at no extra cost unlike Spark.
- -The talk I propose will introduce {disk.frame} to users with the needs to manipulate large amounts of data with minimal setup. They will find {disk.frame} very familiar, as {disk.frame} uses {dplyr} verbs directly;. Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large datasets. They will find lots of benefits in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. Because {disk.frame} can run R functions natively, they will find that {disk.frame} allows them to many R packages directly with {disk.frame}. diff --git a/presentation/rstudio conf 2021/Abstract proposal.md b/presentation/rstudio conf 2021/Abstract proposal.md deleted file mode 100644 index e7406d3..0000000 --- a/presentation/rstudio conf 2021/Abstract proposal.md +++ /dev/null @@ -1,5 +0,0 @@ -Learn how to handle 100GBs of data with ease using {disk.frame} - the larger-than-RAM-data manipulation package. - -R loads data in its entirety into RAM. However, RAM is a precious resource and often do run out. That's why most R user would have run into the "cannot allocate vector of size xxB." error at some point. - -However, the need to handle larger-than-RAM data doesn't go away just because RAM isn't large enough. So many useRs turn to big data tools like Spark for the task. In this talk, I will make the case that {disk.frame} is sufficient and often preferable for manipulating larger-than-RAM data that fit on disk. I will show how you can apply familiar {dplyr}-verbs to manipulate larger-than-RAM data with {disk.frame}. \ No newline at end of file diff --git a/presentation/twin cities/.gitignore b/presentation/twin cities/.gitignore deleted file mode 100644 index 2641667..0000000 --- a/presentation/twin cities/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.mp4 diff --git a/presentation/user 2020/user-2020-tut-proposal.Rmd b/presentation/user 2020/user-2020-tut-proposal.Rmd deleted file mode 100644 index 8345216..0000000 --- a/presentation/user 2020/user-2020-tut-proposal.Rmd +++ /dev/null @@ -1,103 +0,0 @@ ---- -title: "Easy Larger-than-RAM data manipulation with disk.frame - useR! 2020 tutorial proposal" -author: "ZJ" -date: "11/25/2019" -output: pdf_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Title - -Handling 100GBs of data with ease - Larger-than-RAM data manipulation with disk.frame - -## Audience - -The tutorial aims to introduce {disk.frame} to users with the needs to manipulate large amounts of data. The audience will benefit the most from this tutorial if they are already familiar with popular R data frameworks such as dplyr and {data.table} because {disk.frame} uses dplyr and data.table primitives like `select`, and `dt[i, j, by]`. - -Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large dataset. They will find tremendous benefit in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. - -## Instructor background - -I have more than ten years of experience using R and enterprise data manipulation tools like SAS and SQL. I have experience teaching Data Science courses in R in a group setting. I am the author of the {disk.frame} package, hence I have intimate knowledge of the inner workings of {disk.frame} and is, therefore, well-positioned to teach {disk.frame} for larger-than-RAM data manipulation. - - -The website I created for {disk.frame} is -* https://diskframe.com and - -I have given a useR! 
2019 talk on {disk.frame}, see -* https://www.youtube.com/watch?v=3XMTyi_H4q4&t=1s - -My LinkedIn -* https://www.linkedin.com/in/daizj/ - -Here is an R course I wrote to Data Science in general -* https://github.com/xiaodaigh/intro_r_data_science - -## Domain - -Big Data manipulation - efficiently wrangle data that doesn't fit into RAM - - -## Points of appeal (75 words) -Every R user would have run into the "cannot allocate vector of size xxxB." error at some point. For most applications, R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. - -{disk.frame} solves this issue by providing a framework to manipulate data on disk and minimize RAM usage. By using {disk.frame}, the user can perform analysis on much larger data than is normally possible with R. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. - - -## Learning objectives: 100 – 200 words -In this tutorial, the user will learn how to use {disk.frame} effectively to manipulate datasets up to 100GBs in size on a single computer. - -After the tutorial, the user should be able to - -* learn what the pros and cons of common "big data" technologies like DBMS, Spark, and {disk.frame} are -* understand how {disk.frame} can help with manipulating large amounts of data -* understand when to use {disk.frame} and when not to -* confidently use {disk.frame} for any data manipulation task and not worry about running out of RAM -* load large datasets into {disk.frame} format -* manipulate large {disk.frame} files -* summarize large {disk.frame} files - -## Computing requirement -The requirement for {disk.frame} is R 3.4 and the ability to install packages via `install.packages`. A laptop is advised but not necessary. - -If attendees wish to follow along, I recommend that they should have RStudio or Jupyter notebook set up. However, that is not necessary as I will running through code that they can obtain from Github. - -## Teaching Assistant -I don't think Teaching Assistants are necessary. Installing disk.frame is simply running `install.packages("disk.frame")`. - -However, anyone familiar with RStudio setup and installation of packages is welcome as Teaching Assistant. Any expertise in the installation and setting of Jupyter notebook or Jupyter Lab would be highly appreciated. - -## Lesson Plan -The structure will be very informal, and questions are encouraged at all points along the way. - -* Introduction to disk.frame; how it's structured and why it's fast -* Pros and cons of disk.frame; when to use disk.frame and when not to -* Follow along: -* generate a fake dataset that is reasonably large (so avoiding a large download from internet) -* Convert the dataset to CSV -* Show how to load the data from CSV to {disk.frame} -* Manipulate the {disk.frame} -* Do group-by and summary statistics with {disk.frame} -* show how dplyr and data.table syntax can operate on {disk.frame} -* Explain how to perform group-by efficiently -* Demonstrate how joining works for large {disk.frame}s - -**** break **** - -After the break, we will cover advanced {disk.frame} topics - -* Explain how to extend {disk.frame} with a custom function -* Explain how to tune {disk.frame} for maximum efficiency -* Show some convenience features for {disk.frame} in RStudio if time permits -* Conclusion and Q&A - -## Other Considerations -The user can consult https://diskframe.com. It is regularly updated with relevant {disk.frame} resources like articles and how-to-guides. 
- -The intention is to show how to manipulate larger-than-RAM data with {disk.frame}. However, we will not download any such data from the internet. Instead, we will use an existing function to generate such data. Hence, it will not pose any constraint on internet access nor speed. However, students wishing to follow along on laptops will need to install {disk.frame} via `install.packages("disk.frame")`. - - - diff --git a/tests/testthat/test-names.r b/tests/testthat/test-names.r index 2d114ae..6cc6502 100644 --- a/tests/testthat/test-names.r +++ b/tests/testthat/test-names.r @@ -12,13 +12,13 @@ test_that("testing names", { expect_setequal(names(b), c("a","b")) }) -test_that("testing names with lazyfn", { - b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% - mutate(d = a + b) - - expect_setequal(colnames(b), c("a","b", "d")) - expect_setequal(names(b), c("a","b", "d")) -}) +# test_that("testing names with lazyfn", { +# b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% +# mutate(d = a + b) +# +# expect_setequal(colnames(b), c("a","b", "d")) +# expect_setequal(names(b), c("a","b", "d")) +# }) teardown({ fs::dir_delete(file.path(tempdir(), "tmp_names.df")) diff --git a/tests/testthat/test-partitions.r b/tests/testthat/test-partitions.r new file mode 100644 index 0000000..3fb65f2 --- /dev/null +++ b/tests/testthat/test-partitions.r @@ -0,0 +1,24 @@ +context("test-partitions") + +setup({ + setup_disk.frame(workers = 1) +}) + +test_that("test partitions", { + a = as.disk.frame(cars) %>% + write_disk.frame(partitionby="speed", outdir=tempfile()) + + a = a %>% + partition_filter(speed < 10) %>% + collect %>% + arrange(speed, dist) + + + b = cars %>% + filter(speed < 10) %>% + arrange(speed, dist) + + for(n in names(a)) { + expect_equal(a %>% select(!!n), b %>% select(!!n)) + } +}) diff --git a/tutorials/0_feature_tools_drake.r b/tutorials/0_feature_tools_drake.r deleted file mode 100644 index 2ad7388..0000000 --- a/tutorials/0_feature_tools_drake.r +++ /dev/null @@ -1,20 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -library(drake) - -plan = drake::drake_plan( - df = target( - disk.frame::csv_to_disk.frame( - file_in("D:/data/mortgage-risk/feature_matrix_cleaned.csv"), - outdir = drake::drake_tempfile() - ), - format = "diskframe" - ) -) - -system.time(make(plan)) - -loadd(df) - -df diff --git a/tutorials/1_feature_tools.r b/tutorials/1_feature_tools.r deleted file mode 100644 index e5bd328..0000000 --- a/tutorials/1_feature_tools.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - - -system.time( - a <- - csv_to_disk.frame( - "c:/data/feature_matrix_cleaned.csv", - in_chunk_size = 1e5/4 - ) -) - -system.time(rechunk(a, 16, outdir = "c:/data/ft.df", shardby="sk_id_curr")) diff --git a/tutorials/2_feature_tools.r b/tutorials/2_feature_tools.r deleted file mode 100644 index 88b3c08..0000000 --- a/tutorials/2_feature_tools.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -loadd(gd) - -system.time(corgd = cor(gd)) - -library(data.table) - -system.time( - a <- - data.table::fread( - "C:/Users/ZJ.DAI/Documents/git/disk.frame/tutorials/gd.csv")) - - -if(nchunks(df) != 
32) { - rechunk(df, 32, shardby= "sk_id_curr") -} - -# rechunk(df, 32) -plan <- drake::drake_plan( - df = disk.frame(file_in("d:/data/ft.df")), - df = { - if(nchunks(df) != 32) { - rechunk(df, 32, shardby= "sk_id_curr") - } - }, - bads = df %>% - filter(target == 1) %>% - collect(parallel=FALSE) -) - - - -system.time() - -sfrac = nrow(bads)/(nrow(df) - nrow(bads)) - -# sfrac = 0.07490269 -system.time(df1 <- df %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE)) - -goods = df1 - -gd = rbindlist(list(bads, goods)) -gd - - -pryr::object_size(gd) -arrow::write_parquet(gd, "gd.parquet") -fst::write_fst(gd, "gd.fst") -data.table::fwrite(gd, "gd.csv") - -sample_frac(gd, 0.25) %>% - data.table::fwrite("gd.sample.csv") - -gd = fst::read_fst("gd.fst") -library(DataExplorer) -DataExplorer::create_report(gd) - diff --git a/tutorials/2_feature_tools_drake.r b/tutorials/2_feature_tools_drake.r deleted file mode 100644 index a8cb864..0000000 --- a/tutorials/2_feature_tools_drake.r +++ /dev/null @@ -1,63 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv"))#, - # intro_plot = DataExplorer::create_report( - # gd, - # output_file = file_out("output/dataexplorer/plot_intro.html") - # ) -) - -make(plan) - -loadd(gd) - -intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") -) - -build_times(plan) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/2_feature_tools_drake.rmd b/tutorials/2_feature_tools_drake.rmd deleted file mode 100644 index cdbdc84..0000000 --- a/tutorials/2_feature_tools_drake.rmd +++ /dev/null @@ -1,60 +0,0 @@ -```{r} -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -```{r} -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = 
sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")), - intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") - ) -) -system.time(make(plan)) -``` - -```{r} -print(getwd()) - -``` - - -```{r} -loadd(gd) -gd -``` - -``{r} -build_times(plan) -``` diff --git a/tutorials/2_feature_tools_no_plo.r b/tutorials/2_feature_tools_no_plo.r deleted file mode 100644 index a96011c..0000000 --- a/tutorials/2_feature_tools_no_plo.r +++ /dev/null @@ -1,52 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")) -) - -make(plan) - -loadd(gd) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/30-g.R b/tutorials/30-g.R deleted file mode 100644 index 7b5a9ee..0000000 --- a/tutorials/30-g.R +++ /dev/null @@ -1,49 +0,0 @@ -library(disk.frame) - -setup_disk.frame(12) - -a = disk.frame("c:/data/airontimecsv.df/") - -system.time(b <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum(DEP_DELAY)) %>% - collect) - - -path_to_data <- "c:/data/" -rows = 148619656 -recommended_nchunks = recommend_nchunks(file.size(file.path(path_to_data, "combined.csv"))) -in_chunk_size = ceiling(rows/ recommended_nchunks) - -path_to_data = "c:/data/AirOnTimeCSV/" -#path_to_data = "d:/data/" -system.time(a <- csv_to_disk.frame( - list.files(path_to_data, pattern = ".csv$", full.names = TRUE), - outdir = file.path("c:/data/", "airontimecsv.df"), - colClasses = list(character = c("WHEELS_OFF", "WHEELS_ON")) -)) - - -system.time(flights.df <- csv_to_disk.frame( - paste0(path_to_data, "combined.csv"), - outdir = paste0(path_to_data, "combined.laf.df"), - in_chunk_size = in_chunk_size, - backend = "LaF" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readr.df"), - in_chunk_size = in_chunk_size, - colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readr" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readLines.df"), - in_chunk_size = in_chunk_size, - colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readLines" -)) - diff --git a/tutorials/bigreadr.R b/tutorials/bigreadr.R deleted file mode 100644 index c2deac1..0000000 --- a/tutorials/bigreadr.R +++ /dev/null @@ -1,59 +0,0 @@ -library(disk.frame) - -setup_disk.frame() - -system.time(b <- csv_to_disk.frame( - "c:/data/combined.csv", - colClasses = list(character = 
c("WHEELS_ON", "WHEELS_OFF")), - in_chunk_size = 4e6, - chunk_reader = "bigreadr" -)) - -system.time(nl <- bigreadr::nlines("c:/data/combined.csv")) -system.time(a <- bigreadr::split_file( - "c:/data/combined.csv", - every_nlines = 4e6, - repeat_header = TRUE)) - -system.time(b <- csv_to_disk.frame( - bigreadr::get_split_files(a), - header = TRUE, - colClasses = list(character = c("WHEELS_ON", "WHEELS_OFF")) -)) - - - -#' system(' -#' @echo off -#' Setlocal EnableDelayedExpansion -#' for /f "usebackq" %%a in (`dir /b %1`) do ( -#' for /f "usebackq" %%b in (`type %%a ^| find "" /v /c`) do ( -#' set /a lines += %%b -#' ) -#' ) -#' echo %lines% -#' endlocal -#' -#' CountLines "c:\data\combined.csv" -#' ') - -a = disk.frame(file.path(path_to_data, "airontimecsv.df")) - - -system.time(r_mean_del_delay <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum_delay = sum(DEP_DELAY, na.rm = TRUE), n = n()) %>% - collect %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(mean_delay = sum(sum_delay)/sum(n))) - - -library(lubridate) - -dep_delay = r_mean_del_delay %>% - arrange(YEAR, MONTH, DAY_OF_MONTH) %>% - mutate(date = ymd(paste(YEAR, MONTH, DAY_OF_MONTH, sep = "-"))) - -library(ggplot2) -ggplot(dep_delay, aes(date, mean_delay)) + geom_smooth() - diff --git a/tutorials/cran-logs.r b/tutorials/cran-logs.r deleted file mode 100644 index 3628e51..0000000 --- a/tutorials/cran-logs.r +++ /dev/null @@ -1,78 +0,0 @@ -a = data.table::fread("c:/Users/ZJ.DAI/Downloads/a (1).csv") - -cran.stats::read_logs(start=as.Date("2019-08-27"), as.Date("2019-08-27"), dir = "d:/data/cran-logs") - -library(dplyr) -library(data.table) -a[,date:=as.Date(date, "%Y-%m-%d")] - -a1 = a %>% - # filter(N < 100) %>% - right_join(data.frame(date = seq(min(a$date), max(a$date), by="day"))) - -setDT(a1) -a1[,N := ifelse(is.na(N), (lag(N) + lead(N))/2, N)] -a2 = a1[!is.na(N)] -fwrite(a2, "c:/data/a1.csv", row.names = FALSE) - -a3 = cbind(a2, a2[,shift(N,-3:3, type = "lead")] %>% - apply(1,mean)) %>% - filter(!is.na(V2)) %>% - select(-N) -fwrite(a3, "c:/data/a3.csv", row.names = FALSE) - -plot(a3) - -library(pixapi) - -result = pixapi::get_pix_forecasts("https://ep-d5adc7ba-0218-44f2-9ed9-eb6d202784e8.serving.aiaengine.com/invocations", 14) -setDT(result) -a4 = rbind(a3, result[,.(date = as.Date(date), V2)]) -plot(a4) - -a4 %>% - full_join(a1) -> a5 - -write.csv(a5, "a5.csv") - - -rs = cran.stats::read_logs(start = as.Date("2019-09-26"), as.Date("2019-09-30"), dir = "cran-logs") - -library(drake) -library(disk.frame) -setup_disk.frame() - - - -plan <- drake_plan( - lf1 = list.files( - file_in("d:/data/cran-logs"), - pattern="*.csv", - full.names = T - ), - x = target( - disk.frame::csv_to_disk.frame( - lf1, - outdir = drake_tempfile()), - format = "diskframe" - ) -) - -make(plan) - -config <- drake_config(plan) -vis_drake_graph(config) - -x = readd(lf1) - -head(x) - -readd(x) -system.time( - df <- csv_to_disk.frame(list.files("d:/data/cran-logs", pattern="*.csv", full.names = T)) -) - -head(df) - -df = shard(df, shardby = "date") -df[package=="disk.frame",.N, date, keep=c("package","date")][order(date)] diff --git a/tutorials/fannie-mae-to-parquet/0-setup.r b/tutorials/fannie-mae-to-parquet/0-setup.r deleted file mode 100644 index dc5fd19..0000000 --- a/tutorials/fannie-mae-to-parquet/0-setup.r +++ /dev/null @@ -1,15 +0,0 @@ -library(disk.frame) -setup_disk.frame() - - -fm.df = disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - -fm.df %>% - mutate( - yr = substr(monthly.rpt.prd, 7, 11) %>% as.numeric, - mth = 
substr(monthly.rpt.prd, 1, 2) %>% as.numeric) %>% - rechunk(nchunks = nchunks(fm.df), outdir = "c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/", shardby = c("yr", "mth")) - - - - diff --git a/tutorials/fannie-mae-to-parquet/1-testing.r b/tutorials/fannie-mae-to-parquet/1-testing.r deleted file mode 100644 index 9900822..0000000 --- a/tutorials/fannie-mae-to-parquet/1-testing.r +++ /dev/null @@ -1,7 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -df1 = disk.frame("c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/") -system.time(a <- df1[,.N, .(yr, mth), keep=c("yr", "mth")]) - -a diff --git a/tutorials/fannie_mae/00_setup.r b/tutorials/fannie_mae/00_setup.r deleted file mode 100644 index 1fb5ffc..0000000 --- a/tutorials/fannie_mae/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae/01a_read_from_csv.r 
b/tutorials/fannie_mae/01a_read_from_csv.r deleted file mode 100644 index e97b7c6..0000000 --- a/tutorials/fannie_mae/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r b/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r deleted file mode 100644 index be535fe..0000000 --- a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r +++ /dev/null @@ -1,37 +0,0 @@ -source("inst/fannie_mae/0_setup.r") - -# number of rows to read in -compress = 50 - -# how many chunks do we need? -relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -pt <- proc.time() -system.time(lapply(1:l, function(i) { -#system.time(future_lapply(1:6, function(i) { - relative_file_pathi = relative_file_path[i] - full_file_path - csv_to_disk.frame( - full_file_path[i], - glue("test_fm/{relative_file_path[i]}"), - shardby="loan_id", - nchunks=nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|", - compress=compress, - overwrite = T) -})) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r deleted file mode 100644 index 1ee3669..0000000 --- a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r +++ /dev/null @@ -1,5 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame("fmdf") - -rechunk(fmdf, 128) diff --git a/tutorials/fannie_mae/01e_read_data_for_harp.r b/tutorials/fannie_mae/01e_read_data_for_harp.r deleted file mode 100644 index b9f1173..0000000 --- a/tutorials/fannie_mae/01e_read_data_for_harp.r +++ /dev/null @@ -1,85 +0,0 @@ -#1c2 -source("tutorials/fannie_mae/00_setup.r") - -pt <- proc.time() - -# rows to read in one go -rows_to_read = 1e7 - -# load the Fannie Mae disk.frame -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = 
Performance_Variables)) -harp_mapping <- fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", "harp_id")) -setkey(harp_mapping, harp_id) -fst::write_fst(harp_mapping,"harp_mapping.fst.tmp") -fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst") -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - -# took about 438.65 on laptop 500 chunks -# took about 205 on desktop 56 chunks -pt <- proc.time() -system.time( - harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) { - setnames(df, "loan_id", "harp_id") - - merge(df, harp_mapping, by="harp_id") - }, - nchunks = nchunks(fmdf), - in_chunk_size = rows_to_read, - shardby = "loan_id", - outdir = file.path(outpath, "harp.df"), - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|")) - -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - - -if(F) { - # it can be seen that some accounts can start n harp the month the same that it ends in the dataset - harp1 = get_chunk.disk.frame(harp,1) - fmdf1 = get_chunk.disk.frame(fmdf,1) - - harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - - fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)] - - fmdf3 = merge( - fmdf2[,.(max_date = max(date)),loan_id], - harp1[,.(min_date = min(date)),loan_id], - by = "loan_id" - ) - - fmdf3[max_date == min_date,] - - fmdf3[max_date > min_date,] - fmdf3[max_date <= min_date,] - - harp4 = harp[,.N, delq.status, keep="delq.status"] -} - - -# print(nrow(harp)) -# system.time(harp <- merge(harp, harp_mapping, by="harp_id")) -# print(nrow(harp)) -# setkey(harp,"loan_id") -# system.time(fst::write_fst(harp,"harp.fst")) # 16.63 -#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds - -# check if the harp_id -> loan_id is successful -if(F) { - harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt") - fst::write_fst(harp_mapping,"harp_mapping.fst") - - harp[,date:=as.Date(month,"%m/%d/%Y")] - harp[,.N,date][order(date)] - fmdf = disk.frame("fmdf") - system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - harp_uid = unique(harp_mapped$loan_id) - def = intersect(uid, harp_uid) - def -} \ No newline at end of file diff --git a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r b/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r deleted file mode 100644 index 3d344ac..0000000 --- a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r +++ /dev/null @@ -1,40 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -# system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -# setnames(harp, names(harp), c("harp_id","month")) -# -# harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp_mapped = merge(harp, harp_mapping, by="harp_id") -# c(nrow(harp), nrow(harp_mapped)) -# -# harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp[,date:=as.Date(month,"%m/%d/%Y")] -# harp[,.N,date][order(date)] - -system.time(harp <- fst::read_fst("harp.fst")) - -fmdf = disk.frame("fmdf") - -if(F) { - # this was run once to confimr that the -} -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), 
keep = "loan_id"][,unique(loan_id)]) -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id",500, "harp.df", overwrite = T, append = F)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program - - - diff --git a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r b/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r deleted file mode 100644 index 937bd87..0000000 --- a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r +++ /dev/null @@ -1,10 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -# create a most compressed set -a = disk.frame("fmdf") -chunk_lapply(a,function(x) x, outdir="fmdf0", chunks=500, compress=100) - -a1 = disk.frame("fmdf0") - -system.time(print(a[,.N,keep="delq.status"] %>% sum)) -system.time(print(a1[,.N,keep="delq.status"] %>% sum)) diff --git a/tutorials/fannie_mae/02_exploratory.r b/tutorials/fannie_mae/02_exploratory.r deleted file mode 100644 index 2a362d8..0000000 --- a/tutorials/fannie_mae/02_exploratory.r +++ /dev/null @@ -1,19 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame("fmdf0") - -system.time(a <- fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status]) -setkey(a,delq.status); a - -str(a) - -str(head(fmdf)) - -b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b - - -names(fmdf) -servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")] - -servicer.name_cnt[,sum(N),servicer.name] diff --git a/tutorials/fannie_mae/02a_create_forward_looking_flag.r b/tutorials/fannie_mae/02a_create_forward_looking_flag.r deleted file mode 100644 index 2d7dc6b..0000000 --- a/tutorials/fannie_mae/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,97 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -harp = disk.frame(file.path(outpath, "harp.df")) - -# harp contains the first date on which a loan enters into HARP -pt <- proc.time() -harp1 <- harp %>% - srckeep(c("loan_id","monthly.rpt.prd")) %>% - group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine - summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>% - collect(parallel = T) # performs the collection in parallel - -cat(glue("Creating first harp date took: {timetaken(pt)}")) - -if(F) { - # data.table syntax - harp[,.(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))), loan_id, - keep=c("loan_id", "monthly.rpt.prd")] -} - -harp1[,before_12m_first_harp_date:=first_harp_date] -# 164 seconds -system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12) -harp1[,harp_12m := TRUE] - -# break it out into smaller chunks for even faster merging -# took 27 -pt <- proc.time() -harp2 <- shard( - harp1, - "loan_id", - nchunks = nchunks(fmdf), - outdir = file.path(outpath, "first_harp_date.df"), - overwrite=T) -cat(glue::glue("sharding HARP defaults took {timetaken(pt)}")) - -# no need to hard group by it's already sharded by loan_id -# took about 503 seconds -# took about 11:52 -pt <- proc.time() -defaults <- cmap(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = 
df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -# -# a = df[loan_id == "100513171914", ] -# -# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# a[,monthly.rpt.prd2:=monthly.rpt.prd] -# -# -# setkey(df3,"loan_id","start_date","end_date") -# -# a1= foverlaps(a, df3, by.x = c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"), -# by.y=c("loan_id","start_date","end_date")) -# -# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)] - -# create a list that show defaults ---------------------------------------- -# system.time(defaults <- chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae/02b_foverlaps.r b/tutorials/fannie_mae/02b_foverlaps.r deleted file mode 100644 index a89d107..0000000 --- a/tutorials/fannie_mae/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d..0000000 --- a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = 
disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap := pmin(5, as.integer(delq.status))] -tmp5[delq.status == "X", delq.status.cap := 0] -tmp5[delq.status == "", delq.status.cap := 0] - - -tmp6 <- tmp5[,.(ndef=sum(ndef), N=sum(N)),.(monthly.rpt.prd, delq.status.cap)] - - -tmp6[,defr:=ndef/N] - -library(ggplot2) -tmp6 %>% - ggplot + - geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap))) - - diff --git a/tutorials/fannie_mae/02c_plot_odr.r b/tutorials/fannie_mae/02c_plot_odr.r deleted file mode 100644 index 4521511..0000000 --- a/tutorials/fannie_mae/02c_plot_odr.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T), - n_goto_harp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N), - oh = sum(n_goto_harp)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr, - `HARP Conversion Rate%` = oh - )) - - -if(F) { - # data.table syntax - system.time(a_wh <- tmp4[ - ,.(.N, n_defaults = sum(default_12m, na.rm = T), ) - ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")]) - - a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd] - setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%")) -} - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae/02d_harp_check.r b/tutorials/fannie_mae/02d_harp_check.r deleted file mode 100644 index f9fd020..0000000 --- a/tutorials/fannie_mae/02d_harp_check.r +++ /dev/null @@ -1,33 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) 
-setnames(harp,names(harp), c("harp_id","month")) - -harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp_mapped = merge(harp, harp_mapping, by="harp_id") -c(nrow(harp), nrow(harp_mapped)) - -harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp[,date:=as.Date(month,"%m/%d/%Y")] -harp[,.N,date][order(date)] - -fmdf = disk.frame("fmdf") - -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program \ No newline at end of file diff --git a/tutorials/fannie_mae/10_a_one_var.R b/tutorials/fannie_mae/10_a_one_var.R deleted file mode 100644 index fae7540..0000000 --- a/tutorials/fannie_mae/10_a_one_var.R +++ /dev/null @@ -1,191 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number 
--------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = 
`Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae/10_a_one_var_with_fn.R b/tutorials/fannie_mae/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc..0000000 --- a/tutorials/fannie_mae/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10_b_two_vars.r b/tutorials/fannie_mae/10_b_two_vars.r deleted file mode 100644 index d0df6cd..0000000 --- a/tutorials/fannie_mae/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index bd708ed..0000000 --- a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = 
disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/10b_make_scorecard_visible.R b/tutorials/fannie_mae/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832..0000000 --- a/tutorials/fannie_mae/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = 
x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/11_speedglm.R b/tutorials/fannie_mae/11_speedglm.R deleted file mode 100644 index 17b3826..0000000 --- a/tutorials/fannie_mae/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae/12_keras.r b/tutorials/fannie_mae/12_keras.r deleted file mode 100644 index 
732f02a..0000000 --- a/tutorials/fannie_mae/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = 
kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae/13_disk.r b/tutorials/fannie_mae/13_disk.r deleted file mode 100644 index 10adc92..0000000 --- a/tutorials/fannie_mae/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae/3a_get_some_data.r b/tutorials/fannie_mae/3a_get_some_data.r deleted file mode 100644 index b86128a..0000000 --- a/tutorials/fannie_mae/3a_get_some_data.r +++ /dev/null @@ -1,113 +0,0 @@ -library(disk.frame) - -source("inst/fannie_mae/0_setup.r") - -a = disk.frame(file.path(outpath, "fm_with_harp")) - -pt = proc.time() -a1 <- a %>% - srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>% - delayed(~{ - setkey(.x, loan_id) - uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01) - .x[uid,] -}) %>% collect(parallel = T) -timetaken(pt) - -a1[delq.status != "X",delq.statusn := as.numeric(delq.status)] -a1[delq.status %in% c("","X"),delq.statusn := 0] - -#a1[, delq.statusn := pmin(delq.statusn, 3)] - -a1[,.N,delq.statusn] - -a1[is.na(delq.statusn)] - -# remove those that have already defulat -a2 = a1 - -a2[is.na(default_12m), default_12m := FALSE] - -a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)] -setkey(a3, loan_id, monthly.rpt.prd) - -a3[,sum(default_12m)/.N,delq.statusn] - -# create worst delq status last 12 months -system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id]) - -eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, na.rm=T)]"))) - -a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)]) - -a6 = a5#[delq.statusn < 3,] - -a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)] - -a6[,.N, worst_delq_last_12m_capped] - -a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7) - -a6dev = a6[a6devid] -a6val = a6[!a6devid] - -a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped] - -a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped") -a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped") - -setkey(a6val, loan_id, monthly.rpt.prd) -setkey(a6dev, loan_id, monthly.rpt.prd) - -for_write <- function(a6val) { - a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ] - a6val1[,N:=.N,loan_id] - - a6val2 = a6val1[N == 12,] - setkey(a6val2, loan_id, monthly.rpt.prd) - - toremove = a6val2[,.SD[12,delq.statusn], loan_id] - - toremove1 = toremove[V1 >= 3, loan_id] - - a6val2 = a6val2[!(loan_id %in% toremove1), ] - - a6val2[order(loan_id), id := rleid(loan_id)] - setkey(a6val2, loan_id, monthly.rpt.prd) - a6val2 -} - - -a6val2 <- for_write(a6val) -a6dev2 <- for_write(a6dev) - - -a6dev2[,.N, delq.statusn] 
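# The worst_delq_last_12m construction above combines two data.table
# tricks: shift(delq, n = 1:12) returns all twelve lags in one call
# (auto-named V1..V12), and a row-wise pmax() over those columns yields
# the worst status observed in the trailing year. A small sketch on
# simulated data -- loan_id/month/delq are illustrative names only, not
# the Fannie Mae columns:
library(data.table)

set.seed(42)
dt = data.table(
  loan_id = rep(c("A", "B"), each = 15),
  month   = rep(1:15, 2),
  delq    = sample(0:4, 30, replace = TRUE)
)
setkey(dt, loan_id, month)

# one row per input row, so the lag columns line up with dt by position
lags = dt[, shift(delq, n = 1:12), by = loan_id]
dt[, worst_delq_last_12m :=
     do.call(pmax, c(lags[, paste0("V", 1:12), with = FALSE], na.rm = TRUE))]

# Early months have fewer than 12 lags available; pmax(na.rm = TRUE) uses
# whatever history exists, and each loan's first month stays NA.
dt[loan_id == "A"]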
-a6val2[,.N, delq.statusn] - -fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv") -fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv") - - - -gini <- function(a6val) { - setkey(a6val, id, monthly.rpt.prd) - a6val = a6val[seq(12,.N, by=12),] - a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd] - - a6_auc[,score:=-pd] - setkey(a6_auc, score) - - a6_auc[order(score), height := bads/sum(bads)] - a6_auc[order(score), width := tots/sum(tots)] - plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l") - abline(0,1) - a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc) - a6_auc2[,cheight := cumsum(height)] - a6_auc2[,cwidth := cumsum(width)] - 2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1 -} - -gini(a6val2) -gini(a6dev2) - diff --git a/tutorials/fannie_mae/4a_read_appl.r b/tutorials/fannie_mae/4a_read_appl.r deleted file mode 100644 index 6edcd59..0000000 --- a/tutorials/fannie_mae/4a_read_appl.r +++ /dev/null @@ -1,57 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -fs::dir_delete(file.path(outpath, "acq.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae/4b_exploratory.R b/tutorials/fannie_mae/4b_exploratory.R deleted file mode 100644 index 1cf4382..0000000 --- a/tutorials/fannie_mae/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - 
group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default ------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - #filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae/4c_explore_xgboost.r b/tutorials/fannie_mae/4c_explore_xgboost.r deleted file mode 100644 index f6a53cb..0000000 --- a/tutorials/fannie_mae/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 = disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - 
setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae/4d_xgboost.r b/tutorials/fannie_mae/4d_xgboost.r deleted file mode 100644 index 248e676..0000000 --- a/tutorials/fannie_mae/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = 
cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae/5a_appl_model.r b/tutorials/fannie_mae/5a_appl_model.r deleted file mode 100644 index 2c7d97e..0000000 --- a/tutorials/fannie_mae/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - -defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae/5b_one_var.r b/tutorials/fannie_mae/5b_one_var.r deleted file mode 100644 index f95f221..0000000 --- a/tutorials/fannie_mae/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb 
%>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", 
prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae/5c_using_a_fn.r b/tutorials/fannie_mae/5c_using_a_fn.r deleted file mode 100644 index e083fa5..0000000 --- a/tutorials/fannie_mae/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/5d_AutoML.r b/tutorials/fannie_mae/5d_AutoML.r deleted file mode 100644 index 7525c87..0000000 --- a/tutorials/fannie_mae/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev 
= acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae/5e_AutoML_actual.r b/tutorials/fannie_mae/5e_AutoML_actual.r deleted file mode 100644 index 1e59154..0000000 --- a/tutorials/fannie_mae/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/5f_make_scorecard_visible.R b/tutorials/fannie_mae/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a2..0000000 --- a/tutorials/fannie_mae/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, 
bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/6a_origination_year.r b/tutorials/fannie_mae/6a_origination_year.r deleted file mode 100644 index 8115a78..0000000 --- a/tutorials/fannie_mae/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae/7a_speedglm.r b/tutorials/fannie_mae/7a_speedglm.r 
deleted file mode 100644 index 974023b..0000000 --- a/tutorials/fannie_mae/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - -library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae/8a_keras.r b/tutorials/fannie_mae/8a_keras.r deleted file mode 100644 index ef2e148..0000000 --- a/tutorials/fannie_mae/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - 
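-  # each pass below samples 70% of the chunks to fit the softmax model
-  # incrementally (one epoch per chunk) and scores the held-out chunks
-  # for an out-of-sample AUC; the fitted log-odds are then converted to
-  # scorecard points with the points-to-double-the-odds scaling
-  # (20/log(2) points per doubling of the odds), and the loop stops
-  # once the per-band scores come out monotone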
while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/fannie_mae_10pct/00_setup.r b/tutorials/fannie_mae_10pct/00_setup.r deleted file mode 100644 index 23df171..0000000 --- a/tutorials/fannie_mae_10pct/00_setup.r +++ /dev/null @@ -1,44 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -auc = disk.frame:::auc -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -raw_perf_data_path = "C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = 
dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/fannie_mae_10pct/01a_read_from_csv.r b/tutorials/fannie_mae_10pct/01a_read_from_csv.r deleted file mode 100644 index abfe306..0000000 --- a/tutorials/fannie_mae_10pct/01a_read_from_csv.r +++ /dev/null @@ -1,47 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = 1e7 - -# compression ratio, max = 100 for best compression but slower running speed -#compress = 50 - -# set up some variable for future use -relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -# order the order of conversion by prioritising the largest files first. Because -# handling the largest files are the most difficult, and if an error occurs it -# is more likely to occur when converting large files, hence this will allow us -# to identify errors early -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -# convert CSV in parallel -pt <- proc.time() -res = future.apply::future_mapply(function(relative_file_pathi, full_file_pathi) { -#res = mapply(function(relative_file_pathi, full_file_pathi) { - relative_file_pathi # for glue - df = csv_to_disk.frame( - full_file_pathi, - file.path(outpath, glue("raw_fannie_mae/{relative_file_pathi}")), - shardby = "loan_id", - nchunks = nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - #compress = compress, - in_chunk_size = nreadin, - overwrite = T) - - df -}, relative_file_path, full_file_path, SIMPLIFY = F) -print(timetaken(pt)) diff --git a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r b/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r deleted file mode 100644 index 78ccf3e..0000000 --- a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r +++ /dev/null @@ -1,31 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -disk.frame_folders = dir(file.path(outpath, "raw_fannie_mae"),full.names = T) -list_of_disk.frames <- purrr::map(disk.frame_folders, disk.frame) - -pt <- proc.time() -fmdf = rbindlist.disk.frame( - list_of_disk.frames, - outdir = file.path(outpath, "fm.df"), - by_chunk_id = T, - overwrite = T) -print(timetaken(pt)) - -nrow(fmdf) - -nchunks(fmdf) - -fmdf %>% delayed(~.x[,.(sum(is.na(prin_forg_upb_oth )))] -) %>% dput - - -if(F) { - #239.44 - system.time(uid <- fmdf %>% - srckeep("loan_id") %>% - summarise(uid = unique(loan_id)) %>% - collect) - # the sharding is correct - - n_distinct(uid$uid) -} diff --git 
a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r b/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r deleted file mode 100644 index 0a0739f..0000000 --- a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -# rechunk respects the shardkey -system.time(rechunk(fmdf, nchunks(fmdf)*2)) - -nrow(fmdf) - -nchunks(fmdf) - - -if(F) { - system.time(rechunk(fmdf, 6)) -} - diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r deleted file mode 100644 index 041b333..0000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,65 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) - -pt = proc.time() -eg = fmdf %>% - srckeep(c("loan_id", "delq.status")) %>% - group_by(loan_id) %>% - summarise(n = n(), max.delq.status = max(as.integer(delq.status))) %>% - filter(n >= 12, max.delq.status == 3) %>% - delayed(~.x[1,]) %>% - collect -timetaken(pt) - -pt = proc.time() -fmdf_eg = fmdf %>% - srckeep(c("monthly.rpt.prd", "loan_id", "delq.status")) %>% - inner_join(eg, by = "loan_id") %>% - collect %>% - mutate(monthly.rpt.prd = as.Date(monthly.rpt.prd, "%m/%d/%Y")) %>% - arrange(loan_id, monthly.rpt.prd) -timetaken(pt) - -View(fmdf_eg) # see 2nd account - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r deleted file mode 100644 index e5320e1..0000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r +++ /dev/null @@ -1,47 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -outpath -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -nrow(fmdf) - - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = 
date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02b_foverlaps.r b/tutorials/fannie_mae_10pct/02b_foverlaps.r deleted file mode 100644 index d89094d..0000000 --- a/tutorials/fannie_mae_10pct/02b_foverlaps.r +++ /dev/null @@ -1,29 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -head(tmp3) diff --git a/tutorials/fannie_mae_10pct/02c_plot_odr.r b/tutorials/fannie_mae_10pct/02c_plot_odr.r deleted file mode 100644 index fc8308a..0000000 --- a/tutorials/fannie_mae_10pct/02c_plot_odr.r +++ /dev/null @@ -1,34 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -nrow(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) - - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae_10pct/10_a_one_var.R b/tutorials/fannie_mae_10pct/10_a_one_var.R deleted file mode 100644 index e5b3cf0..0000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var.R +++ /dev/null @@ -1,55 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = 
xy[,sum(default_next_12m, na.rm = T)/.N] - - -# xgboost fit ------------------------------------------------------------- - - -################################################### -# the xgboost fit -################################################### -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -# plotting ---------------------------------------------------------------- - - -################################################### -# plotting -################################################### -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() diff --git a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R b/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R deleted file mode 100644 index 266fb10..0000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- disk.frame:::add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_b_rounded.R b/tutorials/fannie_mae_10pct/10_b_rounded.R deleted file mode 100644 index 73c1893..0000000 --- a/tutorials/fannie_mae_10pct/10_b_rounded.R +++ /dev/null @@ -1,123 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = 
xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/10_b_two_vars.r b/tutorials/fannie_mae_10pct/10_b_two_vars.r deleted file mode 100644 
index d0df6cd..0000000 --- a/tutorials/fannie_mae_10pct/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_c_two_vars.R b/tutorials/fannie_mae_10pct/10_c_two_vars.R deleted file mode 100644 index 3124d57..0000000 --- a/tutorials/fannie_mae_10pct/10_c_two_vars.R +++ /dev/null @@ -1,64 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) -secondm - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot( - map_dbl( - list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9), main="AUC vs number of variables",ylab="AUC",xlab="Number of Variables") -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 66eb27b..0000000 --- a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,91 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - 
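-# check_which_is_best below does a greedy forward selection: each round
-# refits every remaining feature as a one-variable xgboost scorecard on
-# top of the running prediction (prev_pred, which add_var_to_scorecard
-# applies as the xgboost base margin), keeps the feature with the
-# highest AUC, and repeats until no features are left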
-add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R deleted file mode 100644 index 36fb832..0000000 --- a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - 
score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 6df43da..0000000 --- a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = 
monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832..0000000 --- a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 
2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/11_speedglm.R b/tutorials/fannie_mae_10pct/11_speedglm.R deleted file mode 100644 index 33b1323..0000000 --- a/tutorials/fannie_mae_10pct/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/12_keras.r b/tutorials/fannie_mae_10pct/12_keras.r deleted file mode 100644 index e43f201..0000000 --- a/tutorials/fannie_mae_10pct/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ 
oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/00_setup.r b/tutorials/fannie_mae_new/00_setup.r deleted file mode 100644 index 5bb675e..0000000 --- a/tutorials/fannie_mae_new/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = 
"c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -# compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae_new/01a_read_from_csv.r b/tutorials/fannie_mae_new/01a_read_from_csv.r deleted file mode 100644 index e97b7c6..0000000 --- a/tutorials/fannie_mae_new/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r 
-  nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc),
-  overwrite = TRUE,
-  .progress = TRUE # show progress
-)
-print(data.table::timetaken(pt))
-
diff --git a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r
deleted file mode 100644
index 1ee3669..0000000
--- a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r
+++ /dev/null
@@ -1,5 +0,0 @@
-library(disk.frame)
-
-fmdf <- disk.frame("fmdf")
-
-rechunk(fmdf, 128)
diff --git a/tutorials/fannie_mae_new/01e_read_data_for_harp.r b/tutorials/fannie_mae_new/01e_read_data_for_harp.r
deleted file mode 100644
index b9f1173..0000000
--- a/tutorials/fannie_mae_new/01e_read_data_for_harp.r
+++ /dev/null
@@ -1,85 +0,0 @@
-#1c2
-source("tutorials/fannie_mae/00_setup.r")
-
-pt <- proc.time()
-
-# rows to read in one go
-rows_to_read = 1e7
-
-# load the Fannie Mae disk.frame
-fmdf <- disk.frame(file.path(outpath, "fm.df"))
-
-#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = Performance_Variables))
-harp_mapping <- fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", "harp_id"))
-setkey(harp_mapping, harp_id)
-fst::write_fst(harp_mapping,"harp_mapping.fst.tmp")
-fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst")
-print("reading in and saving HARP mapping file took: ")
-print(data.table::timetaken(pt))
-
-# took about 438.65 on laptop 500 chunks
-# took about 205 on desktop 56 chunks
-pt <- proc.time()
-system.time(
-  harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) {
-    setnames(df, "loan_id", "harp_id")
-
-    merge(df, harp_mapping, by="harp_id")
-  },
-  nchunks = nchunks(fmdf),
-  in_chunk_size = rows_to_read,
-  shardby = "loan_id",
-  outdir = file.path(outpath, "harp.df"),
-  colClasses = Performance_ColClasses,
-  col.names = Performance_Variables,
-  sep="|"))
-
-print("reading in and saving HARP performance file took: ")
-print(data.table::timetaken(pt))
-
-
-if(F) {
-  # it can be seen that some accounts can start in HARP in the same month that they end in the dataset
-  harp1 = get_chunk.disk.frame(harp,1)
-  fmdf1 = get_chunk.disk.frame(fmdf,1)
-
-  harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")]
-  fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")]
-
-  fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)]
-
-  fmdf3 = merge(
-    fmdf2[,.(max_date = max(date)),loan_id],
-    harp1[,.(min_date = min(date)),loan_id],
-    by = "loan_id"
-  )
-
-  fmdf3[max_date == min_date,]
-
-  fmdf3[max_date > min_date,]
-  fmdf3[max_date <= min_date,]
-
-  harp4 = harp[,.N, delq.status, keep="delq.status"]
-}
-
-
-# print(nrow(harp))
-# system.time(harp <- merge(harp, harp_mapping, by="harp_id"))
-# print(nrow(harp))
-# setkey(harp,"loan_id")
-# system.time(fst::write_fst(harp,"harp.fst")) # 16.63
-#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds
-
-# check if the harp_id -> loan_id is successful
-if(F) {
-  harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt")
-  fst::write_fst(harp_mapping,"harp_mapping.fst")
-
-  harp[,date:=as.Date(month,"%m/%d/%Y")]
-  harp[,.N,date][order(date)]
-  fmdf = disk.frame("fmdf")
-  system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)])
-  harp_uid = unique(harp_mapped$loan_id)
-  def = intersect(uid, harp_uid)
-  def
-}
\ No newline at end of file
diff --git a/tutorials/fannie_mae_new/02_exploratory.r b/tutorials/fannie_mae_new/02_exploratory.r
deleted file mode 100644
index 2a362d8..0000000
--- a/tutorials/fannie_mae_new/02_exploratory.r
+++ /dev/null
@@ -1,19 +0,0 @@
-# 2_exploratory.r
-source("inst/fannie_mae/0_setup.r")
-
-fmdf = disk.frame("fmdf0")
-
-system.time(a <- 
fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status])
-setkey(a,delq.status); a
-
-str(a)
-
-str(head(fmdf))
-
-b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b
-
-
-names(fmdf)
-servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")]
-
-servicer.name_cnt[,sum(N),servicer.name]
diff --git a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r
deleted file mode 100644
index 2d7dc6b..0000000
--- a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r
+++ /dev/null
@@ -1,97 +0,0 @@
-# 2_exploratory.r
-source("inst/fannie_mae/0_setup.r")
-
-fmdf = disk.frame(file.path(outpath, "fm.df"))
-harp = disk.frame(file.path(outpath, "harp.df"))
-
-# harp contains the first date on which a loan enters into HARP
-pt <- proc.time()
-harp1 <- harp %>%
-  srckeep(c("loan_id","monthly.rpt.prd")) %>%
-  group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine
-  summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>%
-  collect(parallel = T) # performs the collection in parallel
-
-cat(glue("Creating first harp date took: {timetaken(pt)}"))
-
-if(F) {
-  # data.table syntax
-  harp[,.(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))), loan_id,
-       keep=c("loan_id", "monthly.rpt.prd")]
-}
-
-harp1[,before_12m_first_harp_date:=first_harp_date]
-# 164 seconds
-system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12)
-harp1[,harp_12m := TRUE]
-
-# break it out into smaller chunks for even faster merging
-# took 27
-pt <- proc.time()
-harp2 <- shard(
-  harp1,
-  "loan_id",
-  nchunks = nchunks(fmdf),
-  outdir = file.path(outpath, "first_harp_date.df"),
-  overwrite=T)
-cat(glue::glue("sharding HARP defaults took {timetaken(pt)}"))
-
-# no need to hard group by; it's already sharded by loan_id
-# took about 503 seconds
-# took about 11:52
-pt <- proc.time()
-defaults <- cmap(fmdf, function(df) {
-  # create the default flag
-  df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")]
-  setkey(df,loan_id, date)
-  df[,delq.status := as.integer(delq.status)]
-  df[,default:=delq.status>=3]
-  df[is.na(default),default:=F]
-
-  df2 = df[default==T,.(loan_id, start_date =date, end_date = date)]
-  lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12
-
-  # get rid of overlaps
-  df2[order(start_date),lag_end_date := shift(end_date,1), loan_id]
-
-  df2[,grp_incr := 1]
-  df2[start_date <= lag_end_date, grp_incr := 0, loan_id]
-
-  df2[,grp:=cumsum(grp_incr)]
-  df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)]
-
-  df3[,grp:=NULL]
-
-  # create the hardship flag
-  # whether the customer goes into hardship in the next 12 months
-  df3
-  },
-  keep = c("monthly.rpt.prd", "delq.status", "loan_id"),
-  outdir = file.path(outpath, "defaults.df"),
-  lazy = F,
-  overwrite = T)
-cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}"))
-
-#
-# a = df[loan_id == "100513171914", ]
-#
-# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")]
-# a[,monthly.rpt.prd2:=monthly.rpt.prd]
-#
-#
-# setkey(df3,"loan_id","start_date","end_date")
-#
-# a1= foverlaps(a, df3, by.x = c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"),
-#               by.y=c("loan_id","start_date","end_date"))
-#
-# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)]

-# create a list that shows defaults ----------------------------------------
-# system.time(defaults <- 
chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae_new/02b_foverlaps.r b/tutorials/fannie_mae_new/02b_foverlaps.r deleted file mode 100644 index 215c789..0000000 --- a/tutorials/fannie_mae_new/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("tutorials/fannie_mae/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d..0000000 --- a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap 
:= pmin(5, as.integer(delq.status))]
-tmp5[delq.status == "X", delq.status.cap := 0]
-tmp5[delq.status == "", delq.status.cap := 0]
-
-
-tmp6 <- tmp5[,.(ndef=sum(ndef), N=sum(N)),.(monthly.rpt.prd, delq.status.cap)]
-
-
-tmp6[,defr:=ndef/N]
-
-library(ggplot2)
-tmp6 %>%
-  ggplot +
-  geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap)))
-
-
diff --git a/tutorials/fannie_mae_new/02c_plot_odr.r b/tutorials/fannie_mae_new/02c_plot_odr.r
deleted file mode 100644
index 4521511..0000000
--- a/tutorials/fannie_mae_new/02c_plot_odr.r
+++ /dev/null
@@ -1,45 +0,0 @@
-# 2_exploratory.r
-source("inst/fannie_mae/0_setup.r")
-
-fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp"))
-
-head(fm_with_harp)
-
-# need a two stage summary
-system.time(a_wh1 <- fm_with_harp %>%
-  srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>%
-  group_by(monthly.rpt.prd, hard = F) %>%
-  summarise(
-    N = n(),
-    n_defaults = sum(default_12m, na.rm = T),
-    n_goto_harp = sum(harp_12m, na.rm=T)) %>%
-  collect(parallel = T) %>%
-  group_by(monthly.rpt.prd) %>%
-  summarise(
-    odr = sum(n_defaults)/sum(N),
-    oh = sum(n_goto_harp)/sum(N)
-  ) %>%
-  rename(
-    Date = monthly.rpt.prd,
-    `Observed Default Rate%` = odr,
-    `HARP Conversion Rate%` = oh
-  ))
-
-
-if(F) {
-  # data.table syntax
-  system.time(a_wh <- tmp4[
-    ,.(.N, n_defaults = sum(default_12m, na.rm = T), )
-    ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")])
-
-  a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd]
-  setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%"))
-}
-
-a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date)
-
-ggplot(a_wh2) +
-  geom_line(aes(x=Date, y = rate, colour = type)) +
-  ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate")
-
-
diff --git a/tutorials/fannie_mae_new/02d_harp_check.r b/tutorials/fannie_mae_new/02d_harp_check.r
deleted file mode 100644
index f9fd020..0000000
--- a/tutorials/fannie_mae_new/02d_harp_check.r
+++ /dev/null
@@ -1,33 +0,0 @@
-#2d_harp_check.r
-source("inst/fannie_mae/0_setup.r")
-
-system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c"))
-setnames(harp,names(harp), c("harp_id","month"))
-
-harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id"))
-fst::write_fst(harp_mapping,"harp_mapping.fst")
-
-harp_mapped = merge(harp, harp_mapping, by="harp_id")
-c(nrow(harp), nrow(harp_mapped))
-
-harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt")
-fst::write_fst(harp_mapping,"harp_mapping.fst")
-
-harp[,date:=as.Date(month,"%m/%d/%Y")]
-harp[,.N,date][order(date)]
-
-fmdf = disk.frame("fmdf")
-
-system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)])
-
-harp_uid = unique(harp_mapped$loan_id)
-
-def = intersect(uid, harp_uid)
-def
-
-system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T))
-
-system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp"))
-
-# nothing in the intersection
-# so it appears HARP ids are changed once loans are in the HARP program
\ No newline at end of file
diff --git a/tutorials/fannie_mae_new/10_a_one_var.R b/tutorials/fannie_mae_new/10_a_one_var.R
deleted file mode 100644
index fae7540..0000000
--- a/tutorials/fannie_mae_new/10_a_one_var.R
+++ /dev/null
@@ -1,191 
+0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - 
geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R b/tutorials/fannie_mae_new/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc..0000000 --- a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) 
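-# each add_var_to_scorecard() call above continues boosting from the previous
-# model's predictions via prev_pred (used as the XGBoost base margin), so the
-# scorecard grows one variable at a time while earlier variables' scores stay
-# fixed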
-timetaken(pt)
-
-pt <- proc.time()
-fifthm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "cscore_b",
-  prev_pred = fourthm$prev_pred,
-  monotone_constraints = -1)
-timetaken(pt)
-
-
-# plot improvement --------------------------------------------------------
-plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9))
-abline(h = 0.7, lty = 2)
-
diff --git a/tutorials/fannie_mae_new/10_b_two_vars.r b/tutorials/fannie_mae_new/10_b_two_vars.r
deleted file mode 100644
index d0df6cd..0000000
--- a/tutorials/fannie_mae_new/10_b_two_vars.r
+++ /dev/null
@@ -1,61 +0,0 @@
-source("inst/fannie_mae/0_setup.r")
-library(disk.frame)
-
-acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2"))
-
-pt <- proc.time()
-firstm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "oltv",
-  monotone_constraints = 1,
-  format_fn = function(v) {
-    ceiling(v / 5) * 5
-  })
-timetaken(pt)
-
-firstm
-
-# now add the second variable ---------------------------------------------
-pt <- proc.time()
-secondm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "dti",
-  prev_pred = firstm$prev_pred,
-  monotone_constraints = 1)
-timetaken(pt)
-
-pt <- proc.time()
-thirdm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "orig_trm",
-  prev_pred = secondm$prev_pred,
-  monotone_constraints = 1)
-timetaken(pt)
-
-pt <- proc.time()
-fourthm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "num_bo",
-  prev_pred = thirdm$prev_pred,
-  monotone_constraints = -1,
-  format_fn = as.numeric)
-timetaken(pt)
-
-pt <- proc.time()
-fifthm <- add_var_to_scorecard(
-  acqall1,
-  "default_next_12m",
-  "cscore_b",
-  prev_pred = fourthm$prev_pred,
-  monotone_constraints = -1)
-timetaken(pt)
-
-
-# plot improvement --------------------------------------------------------
-#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9))
-#abline(h = 0.7, lty = 2)
-
diff --git a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r
deleted file mode 100644
index 6f7a471..0000000
--- a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r
+++ /dev/null
@@ -1,90 +0,0 @@
-source("inst/fannie_mae/0_setup.r")
-library(disk.frame)
-
-acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2"))
-
-
-# 5d develop a function to test all variables
-check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) {
-  #browser()
-  prev_pred = NULL
-  ws = NULL
-  vars_scr = NULL
-
-  while(length(features) > 0) {
-    # try every var
-    #browser()
-    res = furrr::future_map(1:length(features), ~add_var_to_scorecard(
-    #res = purrr::map(1:length(features), ~add_var_to_scorecard(
-      df,
-      target,
-      features[.x],
-      monotone_constraints = monotone_constraints[.x],
-      prev_pred = prev_pred,
-      format_fn = format_fns[[.x]],
-      weight,
-      save_model_fname = glue::glue("{features[.x]}.xgbm")
-    ))
-
-    # which has the best auc
-    w = which.max(map_dbl(res, ~.x$auc))
-    ws = c(ws, w)
-    feature_to_fit = features[w]
-    print(glue::glue("chosen: {feature_to_fit}"))
-
-    var_scr1 = res[[w]]
-    eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))")))
-
-    features = features[-w]
-    format_fns = format_fns[-w]
-    monotone_constraints = monotone_constraints[-w]
-
-    prev_pred = var_scr1$prev_pred
-  }
-  vars_scr
-}
-
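-# num_vars_mon supplies one monotone constraint per entry of num_vars:
-# 1 forces the fitted score to be non-decreasing in the feature, -1
-# non-increasing, and 0 leaves the feature unconstrained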
"orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R b/tutorials/fannie_mae_new/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832..0000000 --- a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, 
length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/11_speedglm.R b/tutorials/fannie_mae_new/11_speedglm.R deleted file mode 100644 index a7355ac..0000000 --- a/tutorials/fannie_mae_new/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/12_keras.r b/tutorials/fannie_mae_new/12_keras.r deleted file mode 100644 index ba164e2..0000000 --- a/tutorials/fannie_mae_new/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 
'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - #browser() - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - #browser() - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/13_disk.r b/tutorials/fannie_mae_new/13_disk.r deleted file mode 100644 index 10adc92..0000000 --- a/tutorials/fannie_mae_new/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae_new/3a_get_some_data.r 
b/tutorials/fannie_mae_new/3a_get_some_data.r
deleted file mode 100644
index b86128a..0000000
--- a/tutorials/fannie_mae_new/3a_get_some_data.r
+++ /dev/null
@@ -1,113 +0,0 @@
-library(disk.frame)
-
-source("inst/fannie_mae/0_setup.r")
-
-a = disk.frame(file.path(outpath, "fm_with_harp"))
-
-pt = proc.time()
-a1 <- a %>%
-  srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>%
-  delayed(~{
-    setkey(.x, loan_id)
-    uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01)
-    .x[uid,]
-}) %>% collect(parallel = T)
-timetaken(pt)
-
-a1[delq.status != "X",delq.statusn := as.numeric(delq.status)]
-a1[delq.status %in% c("","X"),delq.statusn := 0]
-
-#a1[, delq.statusn := pmin(delq.statusn, 3)]
-
-a1[,.N,delq.statusn]
-
-a1[is.na(delq.statusn)]
-
-# remove those that have already defaulted
-a2 = a1
-
-a2[is.na(default_12m), default_12m := FALSE]
-
-a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)]
-setkey(a3, loan_id, monthly.rpt.prd)
-
-a3[,sum(default_12m)/.N,delq.statusn]
-
-# create worst delq status last 12 months
-system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id])
-
-eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, na.rm=T)]")))
-
-a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)])
-
-a6 = a5#[delq.statusn < 3,]
-
-a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)]
-
-a6[,.N, worst_delq_last_12m_capped]
-
-a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7)
-
-a6dev = a6[a6devid]
-a6val = a6[!a6devid]
-
-a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped]
-
-a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped")
-a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped")
-
-setkey(a6val, loan_id, monthly.rpt.prd)
-setkey(a6dev, loan_id, monthly.rpt.prd)
-
-for_write <- function(a6val) {
-  a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ]
-  a6val1[,N:=.N,loan_id]
-
-  a6val2 = a6val1[N == 12,]
-  setkey(a6val2, loan_id, monthly.rpt.prd)
-
-  toremove = a6val2[,.SD[12,delq.statusn], loan_id]
-
-  toremove1 = toremove[V1 >= 3, loan_id]
-
-  a6val2 = a6val2[!(loan_id %in% toremove1), ]
-
-  a6val2[order(loan_id), id := rleid(loan_id)]
-  setkey(a6val2, loan_id, monthly.rpt.prd)
-  a6val2
-}
-
-
-a6val2 <- for_write(a6val)
-a6dev2 <- for_write(a6dev)
-
-
-a6dev2[,.N, delq.statusn]
-a6val2[,.N, delq.statusn]
-
-fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv")
-fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv")
-
-
-
-gini <- function(a6val) {
-  setkey(a6val, id, monthly.rpt.prd)
-  a6val = a6val[seq(12,.N, by=12),]
-  a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd]
-
-  a6_auc[,score:=-pd]
-  setkey(a6_auc, score)
-
-  a6_auc[order(score), height := bads/sum(bads)]
-  a6_auc[order(score), width := tots/sum(tots)]
-  plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l")
-  abline(0,1)
-  a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc)
-  a6_auc2[,cheight := cumsum(height)]
-  a6_auc2[,cwidth := cumsum(width)]
-  2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1
-}
-
-gini(a6val2)
-gini(a6dev2)
-
diff --git a/tutorials/fannie_mae_new/4a_read_appl.r b/tutorials/fannie_mae_new/4a_read_appl.r
deleted file mode 100644
index 29483b2..0000000
--- a/tutorials/fannie_mae_new/4a_read_appl.r
+++ /dev/null
@@ -1,59 +0,0 @@
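-# 4a: convert the acquisitions zip into a disk.frame sharded by loan_id,
-# append the shards into one disk.frame, then left-join it onto the
-# performance data chunk by chunk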
-source("tutorials/fannie_mae/0_setup.r") -library(disk.frame) - -fs::dir_delete(file.path(outpath, "acq.df")) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae_new/4b_exploratory.R b/tutorials/fannie_mae_new/4b_exploratory.R deleted file mode 100644 index 1cf4382..0000000 --- a/tutorials/fannie_mae_new/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 
12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default ------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - #filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae_new/4c_explore_xgboost.r b/tutorials/fannie_mae_new/4c_explore_xgboost.r deleted file mode 100644 index f6a53cb..0000000 --- a/tutorials/fannie_mae_new/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 = disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae_new/4d_xgboost.r b/tutorials/fannie_mae_new/4d_xgboost.r deleted file mode 100644 index 248e676..0000000 --- a/tutorials/fannie_mae_new/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - 
monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae_new/5a_appl_model.r b/tutorials/fannie_mae_new/5a_appl_model.r deleted file mode 100644 index fb08867..0000000 --- a/tutorials/fannie_mae_new/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - 
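-# first_default_date is start_date shifted forward 12 months: start_date was
-# created in 02a by shifting the default month back by 12, so adding 12
-# months here recovers the actual first default month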
-defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae_new/5b_one_var.r b/tutorials/fannie_mae_new/5b_one_var.r deleted file mode 100644 index 8b796e2..0000000 --- a/tutorials/fannie_mae_new/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - 
coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default 
Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae_new/5c_using_a_fn.r b/tutorials/fannie_mae_new/5c_using_a_fn.r deleted file mode 100644 index e083fa5..0000000 --- a/tutorials/fannie_mae_new/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/5d_AutoML.r b/tutorials/fannie_mae_new/5d_AutoML.r deleted file mode 100644 index 7525c87..0000000 --- a/tutorials/fannie_mae_new/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev = acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae_new/5e_AutoML_actual.r b/tutorials/fannie_mae_new/5e_AutoML_actual.r deleted file mode 100644 index 9e7db97..0000000 --- a/tutorials/fannie_mae_new/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - #browser() - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - #browser() - res = furrr::future_map(1:length(features), 
~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R b/tutorials/fannie_mae_new/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a2..0000000 --- a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard 
model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/6a_origination_year.r b/tutorials/fannie_mae_new/6a_origination_year.r deleted file mode 100644 index 8115a78..0000000 --- a/tutorials/fannie_mae_new/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae_new/7a_speedglm.r b/tutorials/fannie_mae_new/7a_speedglm.r deleted file mode 100644 index 3c3fe0c..0000000 --- a/tutorials/fannie_mae_new/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - 
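-# `streamacq` is used by the shglm()/bigglm() calls below but is never
-# defined in this script; presumably it was meant to be built from the
-# banded disk.frame using the stream_shglm() closure above, as the sibling
-# surf_2019_02_demo/11_speedglm.R script does:
-streamacq = stream_shglm(acqall_dev1)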
-library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae_new/8a_keras.r b/tutorials/fannie_mae_new/8a_keras.r deleted file mode 100644 index 1fc7dcd..0000000 --- a/tutorials/fannie_mae_new/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - 
gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/flights-100-1000.r b/tutorials/flights-100-1000.r deleted file mode 100644 index da2f528..0000000 --- a/tutorials/flights-100-1000.r +++ /dev/null @@ -1,49 +0,0 @@ -library(data.table) -library(disk.frame) -setup_disk.frame() - -bench_disk.frame_data.table_group_by <- function(data1,n) { - a.sharded.df = as.disk.frame(data1, shardby = c("year", "month", "day")) - - a.not_sharded.df = as.disk.frame(data1) - - a = copy(data1) - setDT(a) - - data.table_timing = system.time(a[,.(mean_dep_time = mean(dep_time, na.rm=T)), .(year, month, day)])[3] - - disk.frame_sharded_timing = system.time( - a.sharded.df[ - , - .(mean_dep_time = mean(dep_time, na.rm=TRUE)), - .(year, month, day), - keep = c("year", "month","day", "dep_time")])[3] - - - disk.frame_not_sharded_timing = system.time( - a.not_sharded.df[ - , - .( - sum_dep_time = sum(dep_time, na.rm=TRUE), - n = sum(!is.na(dep_time)) - ), - .(year, month, day), - keep = c("year", "month","day", "dep_time")][ - , - .(mean_dep_time = sum(sum_dep_time)/sum(n)), - .(year, month, day) - ])[3] - - barplot( - c(data.table_timing, disk.frame_sharded_timing, disk.frame_not_sharded_timing), - names.arg = c("data.table", "sharded disk.frame", "not sharded disk.frame"), - main = glue:glue("flights duplicated {n} times group-by year, month, day"), - ylab = "Seconds") -} - -system.time(flights_100 <- rbindlist(lapply(1:100, function(x) nycflights13::flights))) - -system.time(flights_1000 <- rbindlist(lapply(1:10, function(x) flights_100))) - -bench_disk.frame_data.table_group_by(flights_100, 100) -bench_disk.frame_data.table_group_by(flights_1000, 1000) diff --git a/tutorials/flights_case_study/0-download-data.sh b/tutorials/flights_case_study/0-download-data.sh deleted file mode 100644 index ba7b135..0000000 --- a/tutorials/flights_case_study/0-download-data.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -cd inst/flights_case_study/data/ - -for i in {1987..2008}; do wget -q http://stat-computing.org/dataexpo/2009/"$i".csv.bz2; done - -for i in {1987..2008}; do bunzip2 "$i".csv.bz2; done diff --git a/tutorials/flights_case_study/1-make-disk-frame.r b/tutorials/flights_case_study/1-make-disk-frame.r deleted file mode 100644 index 19e182b..0000000 --- a/tutorials/flights_case_study/1-make-disk-frame.r +++ /dev/null @@ -1,39 +0,0 @@ - library(disk.frame) - - pt = proc.time() - non_shard = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - 
outdir=paste0("tmp",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) - - - sharded = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) - - - - nrow(non_shard) - nrow(sharded) - - library(disk.frame) - sharded = disk.frame("inst/data/flights_all_shard.df") - a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - - a = a[,sum(N), UniqueCarrier] - - nosharded = disk.frame("inst/data/flights_all_no_shard.df/") - a1 = nosharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - a1 = a1[,sum(N), UniqueCarrier] - - setkey(a, UniqueCarrier) - setkey(a1, UniqueCarrier) - - identical(a, a1) - diff --git a/tutorials/flights_case_study/2-shard-check.r b/tutorials/flights_case_study/2-shard-check.r deleted file mode 100644 index 5ae36dd..0000000 --- a/tutorials/flights_case_study/2-shard-check.r +++ /dev/null @@ -1,46 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) - - -b = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) - - -aa = a[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -bb = b[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -identical(aa,bb) - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - order_by(UniqueCarrier) - -b2 = b %>% group_by(UniqueCarrier) %>% - summarise(n=count()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) - diff --git a/tutorials/flights_case_study/2-shard-check2.r b/tutorials/flights_case_study/2-shard-check2.r deleted file mode 100644 index b4b9716..0000000 --- a/tutorials/flights_case_study/2-shard-check2.r +++ /dev/null @@ -1,31 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = disk.frame("flights_all_sharded.df") -b = disk.frame("flights_all_notsharded.df") - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -b2 = b %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -identical(a2, b2) - -fa = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) - -ab = csv_to_disk.frame(fa, outdir ="pls_del_tmp", overwrite = T) diff --git a/tutorials/flights_case_study/testing.r b/tutorials/flights_case_study/testing.r deleted file mode 100644 index b47cdee..0000000 --- a/tutorials/flights_case_study/testing.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(furrr) 
-library(magrittr) -plan(multiprocess) - -# fs::dir_delete("tmp22007") -# fs::dir_delete("tmp22008") -# fs::dir_delete("tmp32007") -# fs::dir_delete("tmp32008") -# fs::dir_delete("tmp2007") -# fs::dir_delete("tmp2008") - -pt = proc.time() -full = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp3",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -cnames = names(full) - -pt = proc.time() -non_shard = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp",.x), - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -pt = proc.time() -sharded = 1987:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - col.names = cnames, - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -nrow(full) -nrow(non_shard) -nrow(sharded) - -library(disk.frame) -# sharded = disk.frame("inst/data/flights_all_shard.df") -a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - -a = a[,sum(N), UniqueCarrier] - -# non_shard = disk.frame("inst/data/flights_all_no_shard.df/") -a1 = non_shard[,.N, UniqueCarrier, keep="UniqueCarrier"] -a1 = a1[,sum(N), UniqueCarrier] - -setkey(a, UniqueCarrier) -setkey(a1, UniqueCarrier) - -identical(a, a1) - -a2 = merge(a,a1, by="UniqueCarrier") - -a2[,all(V1.x == V1.y)] diff --git a/tutorials/flights_case_study/testing2.r b/tutorials/flights_case_study/testing2.r deleted file mode 100644 index d12f0ae..0000000 --- a/tutorials/flights_case_study/testing2.r +++ /dev/null @@ -1,31 +0,0 @@ -#install.packages("tidyverse") -library(readr) -library(disk.frame) -library(data.table) -library(future) -library(magrittr) -plan(multiprocess) - -pt=proc.time() -a = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) -print(timetaken(pt)) - -pt=proc.time() -b = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/nyc-taxi/nyc-taxi.Rmd b/tutorials/nyc-taxi/nyc-taxi.Rmd deleted file mode 100644 index 658fcc8..0000000 --- a/tutorials/nyc-taxi/nyc-taxi.Rmd +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "NYC Taxi" -author: "ZJ" -date: "8/12/2019" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - - -```{r} -library(disk.frame) -setup_disk.frame() -options(future.globals.maxSize = Inf) -``` - -## Loading the data -```{r} -system.time(nyc.df <- csv_to_disk.frame(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE))) -``` - -```{r} 
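-# the yearly NYC taxi extracts do not share one schema, so count the columns
-# of each CSV (reading only 2 rows) before assuming they can be combined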
-sapply(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE), function(x) { - ncol(data.table::fread(x, nrows = 2)) -}) -``` - -```{r} -nyc.df = disk.frame("nyc.df") -a = nyc.df[,uniqueN(Dispatching_base_num), keep=names(nyc.df)[1]] -a -``` - - -```{r} -nyc.df = disk.frame("nyc.df") -head(nyc.df) -``` - - -```{r} -library(magrittr) -library(lubridate) -system.time(nyc2.df <- nyc.df %>% - cmap(function(chunk) { - chunk[!is.na(Dispatching_base_number), Dispatching_base_num := Dispatching_base_number] - chunk[, Dispatching_base_number:=NULL] - chunk[, Pickup_DateTime := lubridate::as_datetime(Pickup_DateTime)] - chunk[!is.na(Pickup_date), Pickup_DateTime := lubridate::as_datetime(Pickup_date)] - chunk[,Pickup_date:=NULL] - chunk - }, outdir="nyc2.df", lazy = FALSE, overwrite = TRUE)) -``` - -```{r} -nyc2.df = disk.frame("nyc2.df") -head(nyc2.df) -``` - - -```{r} -system.time(shard(nyc2.df, shardby = "Dispatching_base_num", outdir="nyc.sharded.df")) -``` - -```{r} - -``` \ No newline at end of file diff --git a/tutorials/readme.r b/tutorials/readme.r deleted file mode 100644 index bc7d241..0000000 --- a/tutorials/readme.r +++ /dev/null @@ -1,94 +0,0 @@ -#install.packages(c("fst","future","data.table")) -if(!require(devtools)) install.packages("devtools") -if(!require(disk.frame)) devtools::install_github("xiaodaigh/disk.frame") - -library(dplyr) -library(disk.frame) -library(data.table) -library(fst) -#library(future) -#library(future.apply) -#library(data.table) -nworkers = parallel::detectCores(logical=F) -cat(nworkers," workers\n") -#plan(multiprocess, workers = nworkers, gc = T) -#options(future.globals.maxSize=Inf) - -rows_per_chunk = 1e7 -# generate synthetic data -tmpdir = "tmpfst" -fs::dir_delete(tmpdir) - -# write out 2*nworkers chunks -pt = proc.time() -df = disk.frame(tmpdir) -purrr::walk(1:(nworkers*2), function(ii) { - system.time(ab <- data.table(a = runif(rows_per_chunk), b = runif(rows_per_chunk)) ) #102 seconds - add_chunk(df, ab, ii) -}) -cat("Generating data took: ", timetaken(pt), "\n") - - -# read and output the disk.frame as it to assess "sequential" read-write performance -pt = proc.time() -df2 <- map.disk.frame(df, ~.x, outdir = "tmpfst2", lazy = F, overwrite = T) -cat("Read and write took: ", timetaken(pt), "\n") - -# get first few rows -head(df) - -# get last few rows -tail(df) - -# number of rows -nrow(df) - -# number of columns -ncol(df) - - -# dplyr verbs ------------------------------------------------------------- -df = disk.frame(tmpdir) - -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) - -# need a 2nd stage to finalise summing -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) %>% - summarise(suma = sum(suma)) - -# filter -pt = proc.time() -system.time(df_filtered <- df %>% - filter(a < 0.1)) -cat("filtering a < 0.1 took: ", timetaken(pt), "\n") -nrow(df_filtered) - -# group by -pt = proc.time() -res1 <- df %>% - filter(b < 0.1) %>% - mutate(blt005 = b < 0.05) %>% - group_by(blt005, hard = T) %>% # hard group_by is slower but avoid a 2nd stage aggregation - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("group by took: ", timetaken(pt), "\n") - - -# keep only one var is faster -pt = proc.time() -res1 <- df %>% - srckeep("a") %>% #keeping only the column `a` from the input - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise keeping only one column ", timetaken(pt), "\n") - -# same operation without keeping -pt 
= proc.time() -res1 <- df %>% - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise without keeping", timetaken(pt), "\n") diff --git a/tutorials/surf_2019_02_demo/.gitignore b/tutorials/surf_2019_02_demo/.gitignore deleted file mode 100644 index bef9413..0000000 --- a/tutorials/surf_2019_02_demo/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.ipynb_checkpoints -surf_201902.nb.html -surf_201902.html -01_surf_201902.html -01_surf_201902.tex diff --git a/tutorials/surf_2019_02_demo/00_setup.r b/tutorials/surf_2019_02_demo/00_setup.r deleted file mode 100644 index 5cdf0de..0000000 --- a/tutorials/surf_2019_02_demo/00_setup.r +++ /dev/null @@ -1,41 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -raw_perf_data_path = "C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All_small/" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/surf_2019_02_demo/01_surf_201902.rmd b/tutorials/surf_2019_02_demo/01_surf_201902.rmd deleted file mode 100644 index 418ef86..0000000 --- a/tutorials/surf_2019_02_demo/01_surf_201902.rmd +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -# 2_exploratory.r -#source("../fannie_mae/00_setup.r") -source("00_setup.r") -``` - -# Assign a `disk.frame` -You use `disk.frame(path)` to assign a disk.frame - -```{r} -dir(file.path(outpath, "fm_with_harp")) %>% - head -``` - -Use the function `disk.frame` to assign a folder -```{r} -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp -``` - -## Simple info about disk.frame -```{r} -nrow(fm_with_harp) - -ncol(fm_with_harp) -``` - - 
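-A disk.frame is physically a folder of fst chunk files, so you can also
-inspect it chunk by chunk. A quick sketch using two helpers exported by
-disk.frame (`nchunks` and `get_chunk`, both used later in this repo's
-tutorials):
-
-```{r}
-# number of fst chunks backing this disk.frame
-nchunks(fm_with_harp)
-
-# load the first chunk into memory as a data.frame
-head(get_chunk(fm_with_harp, 1))
-```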
-```{r}
-head(fm_with_harp)
-```
-
-```{r}
-tail(fm_with_harp)
-```
-
-## Simple group-by example
-
-I want to compute `sum(default_12m)` grouped by month. The SQL statement looks like:
-
-```sql
-select
-  monthly.rpt.prd,
-  sum(default_12m) as n_defaults,
-  count(*) as cnt,
-  n_defaults/cnt as odr -- observed default rate
-from
-  table
-group by
-  monthly.rpt.prd;
-```
-This analysis only uses two columns, namely `default_12m` and `monthly.rpt.prd`, so I use `srckeep` to ensure that only those two columns are loaded.
-
-The backend for disk.frame is fst, which allows **random access** to columns, i.e. you load only what you need. This is important for performance!
-```{r}
-system.time(a_wh1 <- fm_with_harp %>%
-  srckeep(c("default_12m","monthly.rpt.prd")) %>%
-  group_by(monthly.rpt.prd) %>%
-  summarise(
-    N = n(),
-    n_defaults = sum(default_12m, na.rm = T)))
-```
-The error recognises that the `shardkey` is `loan_id`, which is not the group-by variable `monthly.rpt.prd`.
-
-The operations returned almost instantly because the computation is done lazily: for now the operations are merely recorded. No computation takes place until `collect` or `compute` is called.
-
-Now **play** the recording by calling `collect`. This will take a minute. Back to the slides:
-
-```{r}
-system.time(a_wh2 <- a_wh1 %>% collect) # 60~70 plugged in
-
-a_wh2
-```
-
-```{r}
-# temporary
-# a_wh1 %>%
-#   srckeep(c("monthly.rpt.prd", "default_12m")) %>%
-#   map(function(chunk) {
-#     chunk[1,]
-#   }) %>%
-#   collect
-```
-
-```{r}
-# temporary
-# a_wh1 %>%
-#   srckeep(c("monthly.rpt.prd", "default_12m")) %>%
-#   map(~{
-#     .x[1,.SD]
-#   }) %>%
-#   collect
-```
-
-Once `collect` is called, the resultant data is stored as a data.frame.
-
-However this is not the correct result, as the group by was performed within each chunk. Hence we need a second-stage group by. 
The second group by takes no time at all, as everything was done in memory - -```{r} -system.time(a_wh3 <- a_wh2 %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - -Manipulate the data and plot -```{r} -a_wh4 = a_wh3 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r deleted file mode 100644 index 81c1a33..0000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r +++ /dev/null @@ -1,6 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd deleted file mode 100644 index 4a6f760..0000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -source("../fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp -``` - - -```{r} -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - - -```{r} -a_wh4 = a_wh1 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` - -$$ -600 = a + b\times\log(\frac{1-p}{p}) = a + b\times\log(30)\\ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ - -$$ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index a87a894..0000000 --- a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,112 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 
= res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -aucs = map_dbl(res_all, ~.x$auc) -plot( - aucs, - main = "Fannie Mae Application Scorecard AUC", - ylab="AUC", - xlab = "Number of features") - -if(F) { - library(gganimate) - auc_dt = data.table(num_of_vars = 1:length(aucs), auc = aucs) - auc_dt %>% - ggplot(aes(num_of_vars, auc)) + - geom_point() + - transition_states( - num_of_vars, - transition_length = 2, - state_length = 1 - ) + - enter_fade() + - exit_shrink() + - ease_aes('sine-in-out') -} - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R b/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R deleted file mode 100644 index 675a81c..0000000 --- a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,118 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) - -auc = disk.frame:::auc -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame 
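-#' @details a sketch of the intent: each scorecard variable is scored
-#'   independently with score_one_var() and the per-variable scores are
-#'   row-summed into a single total score; bands that cannot be matched to a
-#'   bin fall back to the variable's bias (see the warning in score_one_var
-#'   above)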
-score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df, line = F) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - if(line == F) { - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) - } else { - lines(cts[seq(1,.N, length.out=100),], lty = 3, col = "blue") - legend("topleft", c("val AUC", "random", "dev AUC"), lty=c(1,2,3), col=c("black","blue","black")) - } -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev, line = T)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -#View(scorecard) - -scorecard[, - low := c(-Inf, feature_lbl[-.N]), variable] -setnames(scorecard, "feature_lbl", "high") - -scorecard=scorecard[,.(variable, low, high, score)] - -DT::datatable(scorecard, options = list(pageLength=20)) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/surf_2019_02_demo/11_speedglm.R b/tutorials/surf_2019_02_demo/11_speedglm.R deleted file mode 100644 index 3e180cb..0000000 --- a/tutorials/surf_2019_02_demo/11_speedglm.R +++ /dev/null @@ -1,51 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -# not gonna work -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/12_keras.r b/tutorials/surf_2019_02_demo/12_keras.r 
deleted file mode 100644 index e038fb0..0000000 --- a/tutorials/surf_2019_02_demo/12_keras.r +++ /dev/null @@ -1,180 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -# compile a kera model -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -# compiling the model ~ -system.time(model <- build_model()) - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, 
base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/13_logistic_regression.r b/tutorials/surf_2019_02_demo/13_logistic_regression.r deleted file mode 100644 index db82258..0000000 --- a/tutorials/surf_2019_02_demo/13_logistic_regression.r +++ /dev/null @@ -1,201 +0,0 @@ - -two_var = acqall_dev[,.(cscore_b, ocltv, mi_pct, default_next_12m)] - -all_vars = collect(acqall_dev) - -non_char = - names(all_vars)[sapply(all_vars, typeof) != c("character")] %>% - setdiff("first_default_date") - -# -# all_vars = tibble(all_vars) -# library(dplyr) -# cor( -# all_vars -# , use = "pairwise.complete.obs") - -a = sort(non_char) -alvarsn = all_vars[, ..a] - -cdt = cor(alvarsn, use = "pairwise.complete.obs") - -cdt2 = cdt[,which(colnames(cdt) == "default_next_12m")] - -cdt3 = data.table(names = names(cdt2), cor = cdt2, abs_cor = abs(cdt2)) - -cdt3 = cdt3[order(abs_cor, decreasing = T)] - -DT::datatable(cdt3) - - -corrplot::corrplot(cdt, method="color", diag=F) - -# Binning - -two_var1 = two_var[!is.na(ocltv),][order(ocltv),.(ocltv, default_next_12m)] - - -two_var2 = two_var1[, .(ndef = sum(default_next_12m), .N), ocltv] - -two_var3 = two_var2[,.(ocltv, ndef, N, GB_odds = cumsum(N)/cumsum(ndef))] - - -wm = which(two_var3$GB_odds == max(Filter(is.finite, two_var3$GB_odds))) -wm3 = wm -two_var3[,.(ocltv, GB_odds)] %>% - plot(main = "Cumulative Good/Bad Odds", - xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - ylim=c(0,600)) - -abline(v = two_var3[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) - ) - -## plot -two_var4 = two_var3[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var3[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm]), GB_odds[wm])]) - - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -abline(v = two_var4[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -## plot 2 -two_var5 = two_var4[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var4[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm3]), GB_odds[wm3])]) - -lines(two_var4[, .(c(two_var3$ocltv[wm3],two_var4$ocltv[wm]), GB_odds[wm])]) - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -abline(v=51, lty=2) - -var1 = res_all[[1]] - -all_vars[, cscore_bin := cut( - all_vars[,var1$feature, with = F][[1]] , - var1$bin[[2]] %>% setdiff(c(701,713))) %>% addNA] - -bp1 = all_vars[order(cscore_bin),.(sum(default_next_12m)/.N), cscore_bin] -barplot( - bp1$V1, - names.arg = bp1[[1]], - main = "cscore_b bin Default Rate" -) - -setnames(bp1,"V1","def_rate") - -overall_df = all_vars[,sum(default_next_12m)/.N] - -bp1[,whole := overall_df] - 
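-# Weight of Evidence per bin: the bin's log good/bad odds relative to the
-# portfolio's overall log good/bad odds. A worked example on made-up rates
-# (not the Fannie Mae data): a bin defaulting at 2% against a 5% overall
-# rate gets log(0.98/0.02) - log(0.95/0.05) ≈ 3.89 - 2.94 ≈ 0.95, so a
-# positive WoE flags a safer-than-average bin under this sign convention.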
-bp1[,woe := log((1-def_rate)/def_rate) - log((1-whole)/whole)] - -DT::datatable(bp1) - -glm(def_rate ~ I(-woe), data = bp1, family = binomial) - -all_vars[,csr_woe := bp1$woe[cscore_bin]] - -DT::datatable(all_vars[1:10,.(default_next_12m, cscore_bin, csr_woe)]) - -glm( - default_next_12m ~ csr_woe, - family = binomial, - data = all_vars -) - -View(two_var) - - - - - - -var2 = res_all[[3]] -all_vars[, ocltv_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -bp2 = all_vars[order(ocltv_bin),.(sum(default_next_12m)/.N), ocltv_bin] -barplot(bp2$V1, names.arg = bp2[[1]], main = "ocltv Orig LTV Default Rate") - - -var2 = res_all[[9]] -var2$feature -all_vars[, dti_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -dtrain <- xgboost::xgb.DMatrix( - label = two_var[,default_next_12m], - data = as.matrix(two_var[,cscore_b])) - -m2 <- xgboost::xgboost( - data=dtrain, - monotone_constraints = -1, # awesome!!! - nrounds = 1, # one tree ONLY - eta = 1, # learning rate - - - - objective = "binary:logitraw", - tree_method="exact", - max_depth = 2, - base_score = two_var[,sum(default_next_12m)/.N] -) - -xgboost::xgb.plot.tree(model=m2) - -plot(m2) - -bp2 = all_vars[order(dti_bin),.(sum(default_next_12m)/.N), dti_bin] -barplot(bp2$V1, names.arg=bp2[[1]], main="dti Debt-to-income Ratio Default Rate") - - -dt <- xgb.model.dt.tree(model = m2) - - -bin_eg = data.table( - Low = c(-Inf, 597, 657, 716), - High = c(597, 657, 716, Inf) -) - -View(bin_eg) - - -m = glm( - default_next_12m ~ - cscore_bin + - ocltv_bin + - dti_bin, - family = binomial, - data=all_vars) - -DT::datatable(broom::tidy(m), options=list(pageLength=16)) diff --git a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb b/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb deleted file mode 100644 index 93e1c58..0000000 --- a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb +++ /dev/null @@ -1,68 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2_exploratory.r\n", - "source(\"inst/fannie_mae/0_setup.r\")\n", - "\n", - "fm_with_harp = disk.frame(file.path(outpath, \"fm_with_harp\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "head(fm_with_harp)\n", - "\n", - "nrow(fm_with_harp)\n", - "\n", - "# need a two stage summary\n", - "system.time(a_wh1 <- fm_with_harp %>% \n", - " srckeep(c(\"default_12m\",\"monthly.rpt.prd\")) %>% \n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " N = n(), \n", - " n_defaults = sum(default_12m, na.rm = T)) %>% \n", - " collect(parallel = T) %>%\n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " odr = sum(n_defaults)/sum(N)\n", - " ) %>% \n", - " rename(\n", - " Date = monthly.rpt.prd,\n", - " `Observed Default Rate%` = odr\n", - " ))\n", - " \n", - "\n", - "a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date)\n", - "\n", - "ggplot(a_wh2) + \n", - " geom_line(aes(x=Date, y = rate, colour = type)) +\n", - " ggtitle(\"Fannie Mae Observed Default Rate over time & HARP Conversion Rate\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.5.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/tutorial.r b/tutorials/tutorial.r 
deleted file mode 100644 index c3e459f..0000000 --- a/tutorials/tutorial.r +++ /dev/null @@ -1,124 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -a = disk.frame::disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - - -nrow(a) -ncol(a) - -head(a) - -system.time(asample <- a %>% - srckeep("loan_id") %>% - filter(!duplicated(loan_id)) %>% - sample_frac(0.01) %>% - collect) - -asample = data.frame(loan_id = asample) - -system.time( - a1 <-a %>% - cmap(~{ - inner_join(.x, asample, by = "loan_id") - }, - lazy = FALSE, - outdir = "c:/data/fannie_mae_disk_frame/fm1pct.df/", - overwrite = TRUE)) - - -a2 = collect(a1) - -data.table::fwrite(a2, "a2.csv") -arrow::write_parquet(a2, "a2.parquet") -fst::write_fst(a2, "a2.fst") - -a2$date = as.Date(a2$monthly.rpt.prd, "%m/%d/%Y") -a2$delq.status = as.numeric(a2$delq.status) - -a2[,.N,delq.status] -a2[is.na(delq.status),delq.status := 0] - -head(a2) - -setkey(a2, loan_id, date) - -system.time( - a3 <- - setDT(a2)[order(date), apply(.SD[,shift(delq.status, (1:24), type="lead")], 1 , function(x) max(x, na.rm=TRUE)), loan_id] -) - -setnames(a3, names(a3), c("loan_id", "default_n24m")) -setDT(a3)[is.infinite(default_n24m), default_n24m := 0] -a3[,.N, default_n24m] - -a2$default_n24m = a3$default_n24m >= 4 - -nrow(a2) - -a2[,.N, default_n24m] - -arrow::write_parquet(a2, "a2.parquet") -arrow::write_parquet(a2, "data.parquet") -fst::write_fst(a2, "a2.fst") -data.table::fwrite(a2, "a2.csv") - -table(sapply(a2, mode)) - -a2 = a2 %>% - janitor::clean_names() - -str(a2$date) -table(sapply(a2, class)) - -map2(a2, names(a2), ~{ - cx = class(.x) - list( - name = .y, - type = - ifelse( - cx == "character", - "text", - ifelse( - cx == "Date", - "datetime", - ifelse( - cx == "logical", - "bool", - ifelse( - cx == "numeric", - "numeric", - "error" - ) - ) - )), - semantic_type = - ifelse( - cx == "character", - "TextColumn", - ifelse( - cx == "Date", - "DateTimeColumn", - ifelse( - cx == "logical", - "BooleanColumn", - ifelse( - cx == "numeric", - "NumericColumn", - "error" - ) - ) - )) - ) %>% jsonlite::toJSON(auto_unbox = TRUE) -}) %>% unlist %>% - paste0(collapse = ", ") -> outjson - -res = glue::glue('{"column": [|outjson|]}', .open = "|", .close = "|") - -write(res, "schema.json") - - - - - - diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore b/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore deleted file mode 100644 index 2960c12..0000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore +++ /dev/null @@ -1 +0,0 @@ -useR!-2020-Tutorial-part-1.html diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd b/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd deleted file mode 100644 index 918ac49..0000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd +++ /dev/null @@ -1,147 +0,0 @@ ---- -title: "useR! 2020 Tutorial Part 1" -output: learnr::tutorial -runtime: shiny_prerendered ---- - -```{r setup, include=FALSE} -library(learnr) -knitr::opts_chunk$set(echo = FALSE) -``` - -## Intro - -### What is disk.frame for? 
- -### Installation - -```{r installation, exercise = TRUE} -# to install CRAN version -install.packages("disk.frame") - -# Uncomment below to install latest development version -# install.packages("remotes") -# remotes::install_github("xiaodaigh/disk.frame") -``` - -On some platforms, it helps to specify the repo if you encounter an -```r -install.packages("disk.frame", repo="https://cran.rstudio.com") -``` -### Download data? - -### Generate data? - - -## Starting disk.frame - -### Exercise - -{disk.frame} can use multiple cores and may transfer data between **workers**. - -So there are some setting up code that is recommended for all users. - -```{r setting_up, exercise=TRUE} -library(disk.frame) - -``` - -You should see a message like this from the console -``` -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - - -```{r setting_up1, exercise=TRUE} -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -If you forget - -```{r setting_up2, exercise=TRUE} -disk.frame::show_boilerplate() - -# or equivalently -disk.frame::show_ceremony() -``` - -TODO insert GIF of inserting code into RStudio. - -## Ingesting Data - -To use {disk.frame} as your data processing engine, you need to store your data in the disk.frame format. - -### The `disk.frame` format - -The {disk.frame} format is a folder containing multiple [`fst`](https://www.fstpackage.org/) files. And the files are named `1.fst`, `2.fst` etc, where each `fst` file is called a **chunk**. - -TODO add quiz - -```{r ingest_data_frame, exercise=TRUE} -mtcars.df = as.disk.frame(mtcars) -mtcars.df -``` - -This works but if your data fits into RAM, you probably don't need {disk.frame}. But perhaps you data.frame fits into RAM but you can't merge it with another dataset, because that would require additional RAM. So storing it as disk.frame before merging can help. - -## Ingesting CSV - -```{r print-limit, exercise=TRUE, exercise.eval=TRUE} -data.table::fwrite(mtcars, "mtcars.csv") -# alternatively -#write.csv(mtcars, "mtcars.csv") - -mtcars.df = csv_to_disk.frame("mtcars.csv") -mtcars.df -``` - -Notice how `mtcars.df` are stored in a temporary folder? You can move it using - -```{r} -move_to(mmtcars.df, "/some/path/permanent") -``` - -or - -```{r} -mtcars.df = csv_to_disk.frame( - "mtcars.csv", - outdir = "/some/path/permanent") - -lib1 = libname("some_folder") - -lib1$mtcars2.df = lib1$mtcars.df %>% filter(mpg < 0.5) -``` - -TODO srckeep("") - - -### Quiz - -*You can include any number of single or multiple choice questions as a quiz. 
Use the `question` function to define a question and the `quiz` function for grouping multiple questions together.* - -Some questions to verify that you understand the purposes of various base and recommended R packages: - -```{r quiz} -quiz( - question("Which package contains functions for installing other R packages?", - answer("base"), - answer("tools"), - answer("utils", correct = TRUE), - answer("codetools") - ), - question("Which of the R packages listed below are used to create plots?", - answer("lattice", correct = TRUE), - answer("tools"), - answer("stats"), - answer("grid", correct = TRUE) - ) -) -``` - diff --git a/tutorials/vs-vaex/ok.py b/tutorials/vs-vaex/ok.py deleted file mode 100644 index 64effa8..0000000 --- a/tutorials/vs-vaex/ok.py +++ /dev/null @@ -1,5 +0,0 @@ -import vaex - -df = vaex.example() -df # Since this is the last statement in a cell, it will print the DataFrame in a nice HTML format. - From e615ac9fdc8a36deca8d7d5e721cfc9490b70165 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Wed, 2 Feb 2022 22:10:10 +1100 Subject: [PATCH 11/16] update --- NAMESPACE | 2 + R/dplyr_verbs.r | 10 -- R/get_chunk.r | 1 + R/get_partition.r | 6 +- R/write_disk.frame.r | 7 +- R/zzz.r | 8 +- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/01-intro.html | 2 +- docs/articles/02-intro-disk-frame.html | 38 +++--- docs/articles/03-concepts.html | 2 +- docs/articles/04-ingesting-data.html | 2 +- docs/articles/05-data-table-syntax.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 6 +- docs/index.html | 39 +++--- docs/pkgdown.yml | 2 +- docs/reference/add_chunk.html | 10 +- docs/reference/as.data.frame.disk.frame.html | 2 +- docs/reference/as.data.table.disk.frame.html | 2 +- docs/reference/as.disk.frame.html | 2 +- docs/reference/bind_rows.disk.frame.html | 2 +- docs/reference/chunk_group_by.html | 2 +- docs/reference/cmap.html | 4 +- docs/reference/cmap2.html | 2 +- docs/reference/collect.html | 2 +- docs/reference/colnames.html | 2 +- docs/reference/compute.disk.frame.html | 2 +- docs/reference/create_chunk_mapper.html | 2 +- docs/reference/csv_to_disk.frame.html | 2 +- docs/reference/delete.html | 2 +- docs/reference/df_ram_size.html | 2 +- docs/reference/dfglm.html | 2 +- docs/reference/disk.frame.html | 4 +- docs/reference/dplyr_verbs.html | 2 +- docs/reference/evalparseglue.html | 2 +- docs/reference/find_globals_recursively.html | 2 +- docs/reference/foverlaps.disk.frame.html | 2 +- docs/reference/gen_datatable_synthetic.html | 2 +- docs/reference/get_chunk.html | 6 +- docs/reference/get_chunk_ids.html | 20 +-- docs/reference/get_partition_paths.html | 132 ++++++++++++++++++ docs/reference/group_by.html | 2 +- docs/reference/groups.disk.frame.html | 2 +- docs/reference/head_tail.html | 2 +- docs/reference/index.html | 14 +- docs/reference/is_disk.frame.html | 2 +- docs/reference/join.html | 2 +- docs/reference/make_glm_streaming_fn.html | 2 +- docs/reference/merge.disk.frame.html | 2 +- docs/reference/move_to.html | 2 +- docs/reference/nchunks.html | 2 +- docs/reference/ncol_nrow.html | 2 +- docs/reference/one-stage-group-by-verbs.html | 2 +- docs/reference/overwrite_check.html | 2 +- docs/reference/partition_filter.html | 133 +++++++++++++++++++ docs/reference/play.html | 2 +- docs/reference/print.disk.frame.html | 2 +- docs/reference/pull.disk.frame.html | 2 +- docs/reference/purrr_as_mapper.html | 2 +- docs/reference/rbindlist.disk.frame.html | 2 +- docs/reference/rechunk.html | 6 +- docs/reference/recommend_nchunks.html | 2 +- 
docs/reference/remove_chunk.html | 8 +- docs/reference/sample.html | 44 +++--- docs/reference/setup_disk.frame.html | 2 +- docs/reference/shard.html | 2 +- docs/reference/shardkey.html | 2 +- docs/reference/shardkey_equal.html | 2 +- docs/reference/show_ceremony.html | 2 +- docs/reference/split_string_into_df.html | 126 ++++++++++++++++++ docs/reference/srckeep.html | 2 +- docs/reference/sub-.disk.frame.html | 2 +- docs/reference/tbl_vars.disk.frame.html | 2 +- docs/reference/write_disk.frame.html | 14 +- docs/reference/zip_to_disk.frame.html | 4 +- man/get_chunk.Rd | 2 + man/get_partition_paths.Rd | 4 + man/split_string_into_df.Rd | 3 + 79 files changed, 581 insertions(+), 176 deletions(-) create mode 100644 docs/reference/get_partition_paths.html create mode 100644 docs/reference/partition_filter.html create mode 100644 docs/reference/split_string_into_df.html diff --git a/NAMESPACE b/NAMESPACE index db105be..388b3d4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -166,6 +166,7 @@ importFrom(dplyr,full_join) importFrom(dplyr,glimpse) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) +importFrom(dplyr,group_map) importFrom(dplyr,group_vars) importFrom(dplyr,groups) importFrom(dplyr,inner_join) @@ -217,5 +218,6 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,memory.limit) importFrom(utils,tail) +importFrom(utils,type.convert) importFrom(utils,unzip) useDynLib(disk.frame) diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 2924324..4958677 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -136,16 +136,6 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup) # do not introduce it as it was never introduced #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework") -add_count.disk.frame <- function(x, ...) { - warning("check if it works") - counts = x %>% - group_by({{...}}) %>% - summarize(n=n()) %>% - collect - - left_join.disk.frame(x, counts) -} - #' @export diff --git a/R/get_chunk.r b/R/get_chunk.r index 01700e0..c012598 100644 --- a/R/get_chunk.r +++ b/R/get_chunk.r @@ -4,6 +4,7 @@ #' @param keep the columns to keep #' @param full.names whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric #' @param ... passed to fst::read_fst or whichever read function is used in the backend +#' @param partitioned_info for internal use only. 
It's a data.frame used to help with filtering by partitions
 #' @export
 #' @examples
 #' cars.df = as.disk.frame(cars, nchunks = 2)
diff --git a/R/get_partition.r b/R/get_partition.r
index 925ff24..b43e68f 100644
--- a/R/get_partition.r
+++ b/R/get_partition.r
@@ -1,4 +1,5 @@
 #' Turn a string of the form /partition1=val/partition2=val2 into data.frame
+#' @param path_strs The paths in string form to break into partition format
 split_string_into_df <- function(path_strs) {
   paths = dirname(path_strs) %>% unique
   list_of_partitions = stringr::str_split(paths, "/")
@@ -26,6 +27,9 @@ if(F) {
 }
 
 #' Get the partitioning structure of a folder
+#' @param df a disk.frame whose paths will be used to determine if it's
+#' a folder-partitioned disk.frame
+#' @importFrom utils type.convert
 get_partition_paths <- function(df) {
   stopifnot("disk.frame" %in% class(df))
   path = tools::file_path_as_absolute(attr(df, "path"))
@@ -44,7 +48,7 @@ get_partition_paths <- function(df) {
   # create a data.frame of the paths so it can be filtered
   df_of_partitions = split_string_into_df(lf)
   # infer the types
-  df_of_partitions = type.convert(df_of_partitions, as.is=TRUE)
+  df_of_partitions = utils::type.convert(df_of_partitions, as.is=TRUE)
 
   # if there is a filter operation, filter the above to figure out
   allowed_paths = df_of_partitions$.disk.frame.sub.path
diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r
index 96a8784..7d38e0c 100644
--- a/R/write_disk.frame.r
+++ b/R/write_disk.frame.r
@@ -12,6 +12,7 @@
 #' @param ... passed to cmap.disk.frame
 #' @export
 #' @import fst fs
+#' @importFrom dplyr group_map
 #' @importFrom glue glue
 #' @examples
 #' cars.df = as.disk.frame(cars)
@@ -51,7 +52,7 @@ write_disk.frame <- function(
       tmp_dir_to_write = tempfile(as.character(.y))
       tmp = .x %>%
         group_by(!!!syms(partitionby)) %>%
-        group_map(~{
+        dplyr::group_map(~{
           # convert group keys to path
           tmp_path = lapply(names(.y), function(n) {
             sprintf("%s=%s", n, .y[, n])
@@ -79,7 +80,7 @@ write_disk.frame <- function(
     partitioned_files %>%
       group_by(partition_path) %>%
-      group_map(function(df, grp) {
+      dplyr::group_map(function(df, grp) {
         mapply(function(file, i) {
           outfile = file.path(outdir, grp$partition_path, paste0(i, ".fst"))
           if(!dir.exists(file.path(outdir, grp$partition_path))) {
@@ -116,8 +117,6 @@ write_disk.frame <- function(
       overwrite = TRUE,
       shardby = shardby,
       compress = compress,
-      shardby_function=shardby_function,
-      sort_splits=sort_splits,
       desc_vars=desc_vars,
       ...
) } diff --git a/R/zzz.r b/R/zzz.r index 7c82cd9..077b1cf 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -58,7 +58,13 @@ globalVariables(c( "pathB", "w", "xid", - "yid")) + "yid", + "paths", + ".disk.frame.sub.path", + "fullpath", + ".check", + "partition_path" + )) #' @useDynLib disk.frame #' @importFrom Rcpp evalCpp diff --git a/docs/404.html b/docs/404.html index 7a1a321..f6e7493 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 9085d47..8f9ec80 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index fa24bbe..f67eac4 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index aadbfeb..3608ddb 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -555,26 +555,26 @@

Sampling
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     5  16      930            906        24     1235           1239
-#> 2: 2013    12  30      818            810         8     1008            955
-#> 3: 2013     2   6     1238           1240        -2     1536           1555
-#> 4: 2013    12  20      921            854        27     1226           1212
-#> 5: 2013     1  15     1452           1500        -8     1625           1619
-#> 6: 2013     2   3     1340           1320        20     1511           1509
+#> 1: 2013     2   1      622            625        -3      722            730
+#> 2: 2013     5  18      949            955        -6     1115           1140
+#> 3: 2013     2   3     1930           1932        -2     2211           2230
+#> 4: 2013     1   2      628            630        -2      935            932
+#> 5: 2013     5  13     1819           1825        -6     2041           2146
+#> 6: 2013     5  23       NA           1329        NA       NA           1618
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        -4      B6    641  N806JB    JFK  SFO      343     2586    9      6
-#> 2:        13      EV   5463  N740EV    LGA  BNA      142      764    8     10
-#> 3:       -19      AA   2041  N5EBAA    JFK  MIA      150     1089   12     40
-#> 4:        14      UA    997  N511UA    EWR  LAX      346     2454    8     54
-#> 5:         6      US   2179  N702UW    LGA  DCA       56      214   15      0
-#> 6:         2      EV   4628  N13123    EWR  STL      132      872   13     20
+#> 1:        -8      WN   3127  N723SW    EWR  BWI       45      169    6     25
+#> 2:       -25      AA    319  N539AA    LGA  ORD      110      733    9     55
+#> 3:       -19      UA   1416  N24715    EWR  IAH      201     1400   19     32
+#> 4:         3      DL   2137  N912DE    LGA  TPA      159     1010    6     30
+#> 5:       -65      UA   1075  N24702    EWR  SNA      286     2434   18     25
+#> 6:        NA      UA   1122            EWR  PBI       NA     1023   13     29
 #>              time_hour
-#> 1: 2013-05-16 13:00:00
-#> 2: 2013-12-30 13:00:00
-#> 3: 2013-02-06 17:00:00
-#> 4: 2013-12-20 13:00:00
-#> 5: 2013-01-15 20:00:00
-#> 6: 2013-02-03 18:00:00
+#> 1: 2013-02-01 11:00:00 +#> 2: 2013-05-18 13:00:00 +#> 3: 2013-02-04 00:00:00 +#> 4: 2013-01-02 11:00:00 +#> 5: 2013-05-13 22:00:00 +#> 6: 2013-05-23 17:00:00

Writing Data diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index 2a9aefa..ff39373 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.6.999.999

diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index 3c7635a..c45ac0d 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index 68574b8..40cb549 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/articles/index.html b/docs/articles/index.html index e505d22..b3365d4 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/authors.html b/docs/authors.html index 72268a0..4a6d5e5 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.6.1, https://diskframe.com. +R package version 0.6.999.999, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.6.1},
+  note = {R package version 0.6.999.999},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index a664a0a..c636c75 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -248,15 +248,12 @@

dplyr verbsfilter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay -#> 1: 2013 1 1 517 515 2 830 819 11 -#> 2: 2013 1 1 533 529 4 850 830 20 -#> carrier flight tailnum origin dest air_time distance hour minute -#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 -#> 2: UA 1714 N24211 LGA IAH 227 1416 5 29 -#> time_hour origin_dest -#> 1: 2013-01-01 05:00:00 EWRIAH -#> 2: 2013-01-01 05:00:00 LGAIAH +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum +#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 +#> 2: 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 +#> origin dest air_time distance hour minute time_hour origin_dest +#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH +#> 2: LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH

Group-by @@ -407,26 +404,22 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp6HjtTk\\file16141b3219fc.df"

+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp0yOSxO\\filecf453d9d99.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
-#> 1: 2013     1   1      517            515         2      830            819        11
-#>    carrier flight tailnum origin dest air_time distance hour minute
-#> 1:      UA   1545  N14228    EWR  IAH      227     1400    5     15
-#>              time_hour
-#> 1: 2013-01-01 05:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum +#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 +#> origin dest air_time distance hour minute time_hour +#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
-#> 1: 2013     9  30       NA            840        NA       NA           1020        NA
-#>    carrier flight tailnum origin dest air_time distance hour minute
-#> 1:      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
-#>              time_hour
-#> 1: 2013-09-30 08:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum +#> 1: 2013 9 30 NA 840 NA NA 1020 NA MQ 3531 N839MQ +#> origin dest air_time distance hour minute time_hour +#> 1: LGA RDU NA 431 8 40 2013-09-30 08:00:00
 # number of rows
 nrow(flights.df)
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 961402e..ec31e3e 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -14,5 +14,5 @@ articles:
   10-group-by: 10-group-by.html
   11-custom-group-by: 11-custom-group-by.html
   88-trouble-shooting: 88-trouble-shooting.html
-last_built: 2022-02-01T11:57Z
+last_built: 2022-02-02T10:59Z
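The pkgdown rebuild above accompanies the substantive change in this patch: folder-level partitioning. Below is a minimal sketch of the intended workflow, pieced together only from the signatures visible in this diff (write_disk.frame()'s new `partitionby` argument and the new `partition_filter()` verb); the output directory, the choice of `origin` as the partition column, and the exact collect() behaviour are assumptions rather than documented behaviour.

```r
library(disk.frame)
library(nycflights13)
setup_disk.frame()

# partitionby is the new argument ("the columns to (folder) partition by"),
# so chunks should land in folders such as origin=JFK/1.fst;
# the output path here is made up for illustration
flights.df = as.disk.frame(flights)
flights_part.df = write_disk.frame(
  flights.df,
  outdir = file.path(tempdir(), "flights_by_origin.df"),
  partitionby = "origin"
)

# partition_filter() filters "at (folder) partition level", so a condition
# on the partition column should prune whole folders before any chunk is read
flights_part.df %>%
  partition_filter(origin == "JFK") %>%
  collect
```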
 
diff --git a/docs/reference/add_chunk.html b/docs/reference/add_chunk.html
index 93564a5..2228be8 100644
--- a/docs/reference/add_chunk.html
+++ b/docs/reference/add_chunk.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.1
+        0.6.999.999
       
     
@@ -135,12 +135,12 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 @@ -154,12 +154,12 @@

Examples

# you wish to add multiple chunk in parralel add_chunk(df2, data.frame(chunk=1), 1) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk2" #> nchunks: 1 #> nrow (at source): 1 #> ncol (at source): 1 add_chunk(df2, data.frame(chunk=2), 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk2" #> nchunks: 2 #> nrow (at source): 2 #> ncol (at source): 1 diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index 586cd04..7f2dffe 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index aaf5994..0d3427a 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index 780f6ec..86693e5 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html index 26ccb67..0143bc9 100644 --- a/docs/reference/bind_rows.disk.frame.html +++ b/docs/reference/bind_rows.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index 4560d29..f596c04 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -34,7 +34,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index 1d06c61..0161dca 100644 --- a/docs/reference/cmap.html +++ b/docs/reference/cmap.html @@ -21,7 +21,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -257,7 +257,7 @@

Examples

# return the first row of each chunk eagerly as list cmap(cars.df, ~.x[1,], lazy = FALSE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file126438e21837.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file1710252959dd.df" #> nchunks: 6 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index d1c8aca..49aa645 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/collect.html b/docs/reference/collect.html index f3dce80..ab12f85 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -20,7 +20,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 29bd615..637ba39 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -20,7 +20,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 82d13a1..3896951 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index b2df53e..a3bca37 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index 6291bfe..7c88d8c 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 1b43be9..c36e9bc 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index f9a9fb4..6c636c9 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index f3e79f4..da652dc 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 30b2f5a..2912acf 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -111,7 +111,7 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
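For readers skimming the rendered-docs churn in this commit, the core mechanism is small: R/get_partition.r (shown earlier in this patch) parses Hive-style key=value folder names into a data.frame and then lets utils::type.convert() infer the column types. Here is a self-contained sketch of that idea; it mirrors what split_string_into_df() is documented to do, not its exact return value (which also carries a .disk.frame.sub.path column), and the example paths are invented.

```r
library(stringr)

# made-up chunk paths in the layout this patch series writes, e.g. as
# produced by write_disk.frame(..., partitionby = c("year", "origin"))
paths = c("year=2013/origin=JFK/1.fst", "year=2013/origin=LGA/2.fst")

# one row per unique directory, one column per partition=value pair
parts = str_split(unique(dirname(paths)), "/")
partition_df = do.call(rbind, lapply(parts, function(p) {
  kv = str_split_fixed(p, "=", 2)   # keys in column 1, values in column 2
  row = as.data.frame(t(kv[, 2]), stringsAsFactors = FALSE)
  names(row) = kv[, 1]
  row
}))

# infer column types, as the patched get_partition_paths() does
partition_df = utils::type.convert(partition_df, as.is = TRUE)
partition_df
#>   year origin
#> 1 2013    JFK
#> 2 2013    LGA
```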
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index fbaf09b..755512a 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.1
+        0.6.999.999
       
     
diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index 22f6201..b2fd58c 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/find_globals_recursively.html b/docs/reference/find_globals_recursively.html index 57361cf..094f26d 100644 --- a/docs/reference/find_globals_recursively.html +++ b/docs/reference/find_globals_recursively.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index f968053..fa5ddd0 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 642d489..e194526 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index 4dc6f17..d41a992 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -99,7 +99,7 @@

Obtain one chunk by chunk id

get_chunk(...)
 
 # S3 method for disk.frame
-get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
+get_chunk(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL)
@@ -114,6 +114,8 @@

Arguments

the columns to keep

full.names

whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

+
partitioned_info
+

for internal use only. It's a data.frame used to help with filtering by partitions
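The new `partitioned_info` argument is internal plumbing for the partition filtering above; user-level calls to get_chunk() are unchanged. A short usage sketch extending the cars.df roxygen example from this patch; the "1.fst" form assumes the default chunk-naming convention.

```r
library(disk.frame)
setup_disk.frame()

# from the roxygen example earlier in this patch
cars.df = as.disk.frame(cars, nchunks = 2)

# read the first chunk back as a data.frame;
# partitioned_info stays at its NULL default for user-level calls
get_chunk(cars.df, 1)

# chunks can also be addressed by their relative file name
get_chunk(cars.df, "1.fst")

# clean up, as the reference examples do
delete(cars.df)
```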

diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index d1ea2e3..01e4fcb 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999
@@ -117,20 +117,22 @@

Examples

# return the integer-string chunk IDs get_chunk_ids(cars.df) -#> [1] "1" "2" "3" "4" "5" "6" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1" "2" "3" "4" "5" "6" # return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpC4Yg8P\\file12644d401fcc.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) -#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" # clean up cars.df delete(cars.df) diff --git a/docs/reference/get_partition_paths.html b/docs/reference/get_partition_paths.html new file mode 100644 index 0000000..8ff4d44 --- /dev/null +++ b/docs/reference/get_partition_paths.html @@ -0,0 +1,132 @@ + +Get the partitioning structure of a folder — get_partition_paths • disk.frame + + +
+
+ + + +
+
+ + +
+

Get the partitioning structure of a folder

+
+ +
+
get_partition_paths(df)
+
+ +
+

Arguments

+
df
+

a disk.frame whose paths will be used to determine if it's
+a folder-partitioned disk.frame

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index 2ad41d2..f32ba71 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 30384b4..e490ad5 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 521dfe0..32de0af 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/index.html b/docs/reference/index.html index 18dd32f..6900fdb 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -196,6 +196,10 @@

All functions get_chunk_ids()

Get the chunk IDs and files names

+ +

get_partition_paths()

+ +

Get the partitioning structure of a folder

groups(<disk.frame>)

@@ -244,6 +248,10 @@

All functions overwrite_check()

Check if the outdir exists or not

+ +

partition_filter()

+ +

Filter the dataset based on folder partitions

play()

@@ -300,6 +308,10 @@

All functions show_ceremony() ceremony_text() show_boilerplate() insert_ceremony()

Show the code to setup disk.frame

+ +

split_string_into_df()

+ +

Turn a string of the form /partition1=val/partition2=val2 into data.frame

srckeep()

diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index 3caaa6e..a533983 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/join.html b/docs/reference/join.html index 2ce46ce..b529a90 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index eef2b62..7da36ad 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index 0d3036b..2d9a833 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index cff2ae4..c65a953 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index fc2f3ae..38f5361 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 04baa3e..d509b27 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index b16d4eb..5673efc 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 5bc041f..5edf126 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/partition_filter.html b/docs/reference/partition_filter.html new file mode 100644 index 0000000..95ce4d0 --- /dev/null +++ b/docs/reference/partition_filter.html @@ -0,0 +1,133 @@ + +Filter the dataset based on folder partitions — partition_filter • disk.frame + + +
+
+ + + +
+
+ + +
+

Filter the dataset based on folder partitions

+
+ +
+
partition_filter(x, ...)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
...
+

conditions for filtering the disk.frame at the (folder) partition level

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/play.html b/docs/reference/play.html index 6a06806..326a0a2 100644 --- a/docs/reference/play.html +++ b/docs/reference/play.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index 2c5f1bd..9e5b9bb 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index c93de3c..bd63b61 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html index ab7b96e..ee2f372 100644 --- a/docs/reference/purrr_as_mapper.html +++ b/docs/reference/purrr_as_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index af351f4..b8ce4aa 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index 80eeda6..53a6ae6 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -126,8 +126,8 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\back_up_tmp_dir126429f0440c. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file12641cab3271.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\back_up_tmp_dir1710762812d3. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file17106dc327d2.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index 9514ebf..11b63d0 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 0cdd331..b7e8f1e 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -116,7 +116,7 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 @@ -125,7 +125,7 @@

Examples

# removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 @@ -134,7 +134,7 @@

Examples

# removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file1264402d4b28.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index a3489f0..82e1e23 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -127,28 +127,28 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist -#> 1: 7 4 -#> 2: 7 22 +#> 1: 4 10 +#> 2: 7 4 #> 3: 9 10 -#> 4: 4 10 -#> 5: 12 20 -#> 6: 11 28 -#> 7: 13 26 -#> 8: 13 34 -#> 9: 15 26 -#> 10: 15 20 -#> 11: 13 46 -#> 12: 16 32 -#> 13: 18 84 -#> 14: 16 40 -#> 15: 18 56 -#> 16: 18 76 -#> 17: 20 56 -#> 18: 19 46 -#> 19: 19 68 -#> 20: 20 52 -#> 21: 24 70 -#> 22: 24 120 +#> 4: 10 18 +#> 5: 11 17 +#> 6: 12 28 +#> 7: 12 20 +#> 8: 12 14 +#> 9: 16 32 +#> 10: 14 60 +#> 11: 15 20 +#> 12: 14 26 +#> 13: 17 32 +#> 14: 18 42 +#> 15: 17 40 +#> 16: 18 56 +#> 17: 19 68 +#> 18: 20 32 +#> 19: 20 56 +#> 20: 19 46 +#> 21: 24 120 +#> 22: 24 70 #> speed dist # clean up cars.df diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 6f180be..4e6a3d8 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index ed2ac68..5d4cfbb 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index 1af8a25..a66f3df 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index 30e16fb..46a45da 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index cfc32e8..3686247 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/split_string_into_df.html b/docs/reference/split_string_into_df.html new file mode 100644 index 0000000..9e33093 --- /dev/null +++ b/docs/reference/split_string_into_df.html @@ -0,0 +1,126 @@ + +Turn a string of the form /partion1=val/partion2=val2 into data.frame — split_string_into_df • disk.frame + + +
+
+ + + +
+
+ + +
+

Turn a string of the form /partition1=val/partition2=val2 into data.frame

+
+ +
+
split_string_into_df(path_strs)
+
+ + +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 8643052..c8e6ce5 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 51bb55f..231a4d1 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index 1c23b6e..d5d64d0 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 4365dff..187568d 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -105,10 +105,8 @@

Write disk.frame to disk

recommend_nchunks(diskf)), overwrite = FALSE, shardby = NULL, + partitionby = NULL, compress = 50, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, ... ) @@ -127,14 +125,10 @@

Arguments

overwrite output directory

shardby

the columns to shard by

+
partitionby
+

the columns to (folder) partition by

compress

compression ratio for fst files

-
shardby_function
-

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

-
sort_splits
-

for the "sort" shardby function, a dataframe with the split values.

-
desc_vars
-

for the "sort" shardby function, the variables to sort descending.

...

passed to cmap.disk.frame

diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index 4e1388d..73b3980 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.6.1 + 0.6.999.999 @@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame
zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df"))
-#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpC4Yg8P\file126473c130c2.zip' cannot be opened
+#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171021f94c27.zip' cannot be opened
 
# there is only one csv file so it returns a list of one disk.frame
zip.df[[1]]
diff --git a/man/get_chunk.Rd b/man/get_chunk.Rd
index 8e144c6..8646346 100644
--- a/man/get_chunk.Rd
+++ b/man/get_chunk.Rd
@@ -19,6 +19,8 @@ get_chunk(...)
 \item{keep}{the columns to keep}
 
 \item{full.names}{whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric}
+
+\item{partitioned_info}{for internal use only. It's a data.frame used to help with filtering by partitions}
 }
 \description{
 Obtain one chunk by chunk id
 }
diff --git a/man/get_partition_paths.Rd b/man/get_partition_paths.Rd
index 5f230f7..33bb5d3 100644
--- a/man/get_partition_paths.Rd
+++ b/man/get_partition_paths.Rd
@@ -6,6 +6,10 @@
 \usage{
 get_partition_paths(df)
 }
+\arguments{
+\item{df}{a disk.frame whose paths will be used to determine if it's
+a folder-partitioned disk.frame}
+}
 \description{
 Get the partitioning structure of a folder
 }
diff --git a/man/split_string_into_df.Rd b/man/split_string_into_df.Rd
index 345afd6..c18f63c 100644
--- a/man/split_string_into_df.Rd
+++ b/man/split_string_into_df.Rd
@@ -6,6 +6,9 @@
 \usage{
 split_string_into_df(path_strs)
 }
+\arguments{
+\item{path_strs}{The paths in string form to break into partition format}
+}
 \description{
 Turn a string of the form /partition1=val/partition2=val2 into data.frame
 }

From 174baed849e38114d674cfe74a4d5127584ad2dc Mon Sep 17 00:00:00 2001
From: ZJ Dai
Date: Wed, 2 Feb 2022 22:14:20 +1100
Subject: [PATCH 12/16] ready for submission in two weeks

---
 DESCRIPTION      |  4 ++--
 NEWS.md          |  4 ++++
 cran-comments.md | 12 +++++-------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9ec2a53..3c33aa0 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: disk.frame
 Title: Larger-than-RAM Disk-Based Data Manipulation Framework
-Version: 0.6.999.999
-Date: 2022-02-01
+Version: 0.7
+Date: 2022-02-15
 Authors@R: c(
     person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")),
     person("Jacky", "Poon", role = c("ctb"))
diff --git a/NEWS.md b/NEWS.md
index c5e6aeb..db7468a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# disk.frame 0.7
+* Added support for partitioning data by folder
+* Updated the required R version to 4.0
+
 # disk.frame 0.6.1
 * Fixed bug with data.table syntax
 * Auto detection of srckeep in group by
diff --git a/cran-comments.md b/cran-comments.md
index 05b563d..245c6d7 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1,14 +1,12 @@
-## Resubmission 1 for v0.6.0
-* removed the vignette builder which should remove the 1 NOTE
-
-## Submission for v0.6.0
-* Updated the API for many functions leading to semver update of functions
+## Submission for v0.7
+* Implemented partitioning by folder
+* Updated the required R version to v4.0
 
 ## Test environments
 * local Windows 11 Pro install, R 4.1.2
-* local Windows 11 Pro install, R devel (as of 2022-01-31)
+* local Windows 11 Pro install, R devel (as of 2022-02-01)
 * local Linux/Ubuntu install, R 4.1.2
-* local Linux/Ubuntu install, R devel (as of 2022-01-31)
+* local Linux/Ubuntu install, R devel (as of 2022-02-01)
 
 ## R CMD check results
 There were no ERRORs nor WARNINGs nor
NOTE when run locally. From 5ddfcafbb675fc62f2f066f9843848198703e1f3 Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Wed, 2 Feb 2022 22:34:54 +1100 Subject: [PATCH 13/16] updated site --- docs/404.html | 2 +- docs/LICENSE-text.html | 2 +- docs/articles/01-intro.html | 2 +- docs/articles/02-intro-disk-frame.html | 38 +++++++++---------- docs/articles/03-concepts.html | 2 +- docs/articles/04-ingesting-data.html | 2 +- docs/articles/05-data-table-syntax.html | 2 +- docs/articles/06-vs-dask-juliadb.html | 10 ++--- docs/articles/07-glm.html | 2 +- docs/articles/08-more-epic.html | 2 +- docs/articles/09-convenience-features.html | 2 +- docs/articles/10-group-by.html | 2 +- docs/articles/11-custom-group-by.html | 2 +- docs/articles/88-trouble-shooting.html | 2 +- docs/articles/index.html | 2 +- docs/authors.html | 6 +-- docs/index.html | 39 ++++++++++++-------- docs/news/index.html | 9 ++++- docs/pkgdown.yml | 2 +- docs/reference/add_chunk.html | 10 ++--- docs/reference/as.data.frame.disk.frame.html | 2 +- docs/reference/as.data.table.disk.frame.html | 2 +- docs/reference/as.disk.frame.html | 2 +- docs/reference/bind_rows.disk.frame.html | 2 +- docs/reference/chunk_group_by.html | 2 +- docs/reference/cmap.html | 4 +- docs/reference/cmap2.html | 2 +- docs/reference/collect.html | 2 +- docs/reference/colnames.html | 2 +- docs/reference/compute.disk.frame.html | 2 +- docs/reference/create_chunk_mapper.html | 2 +- docs/reference/csv_to_disk.frame.html | 2 +- docs/reference/delete.html | 2 +- docs/reference/df_ram_size.html | 2 +- docs/reference/dfglm.html | 2 +- docs/reference/disk.frame.html | 4 +- docs/reference/dplyr_verbs.html | 2 +- docs/reference/evalparseglue.html | 2 +- docs/reference/find_globals_recursively.html | 2 +- docs/reference/foverlaps.disk.frame.html | 2 +- docs/reference/gen_datatable_synthetic.html | 2 +- docs/reference/get_chunk.html | 2 +- docs/reference/get_chunk_ids.html | 14 +++---- docs/reference/get_partition_paths.html | 4 +- docs/reference/group_by.html | 2 +- docs/reference/groups.disk.frame.html | 2 +- docs/reference/head_tail.html | 2 +- docs/reference/index.html | 2 +- docs/reference/is_disk.frame.html | 2 +- docs/reference/join.html | 2 +- docs/reference/make_glm_streaming_fn.html | 2 +- docs/reference/merge.disk.frame.html | 2 +- docs/reference/move_to.html | 2 +- docs/reference/nchunks.html | 2 +- docs/reference/ncol_nrow.html | 2 +- docs/reference/one-stage-group-by-verbs.html | 2 +- docs/reference/overwrite_check.html | 2 +- docs/reference/partition_filter.html | 2 +- docs/reference/play.html | 2 +- docs/reference/print.disk.frame.html | 2 +- docs/reference/pull.disk.frame.html | 2 +- docs/reference/purrr_as_mapper.html | 2 +- docs/reference/rbindlist.disk.frame.html | 2 +- docs/reference/rechunk.html | 6 +-- docs/reference/recommend_nchunks.html | 2 +- docs/reference/remove_chunk.html | 8 ++-- docs/reference/sample.html | 32 ++++++++-------- docs/reference/setup_disk.frame.html | 2 +- docs/reference/shard.html | 2 +- docs/reference/shardkey.html | 2 +- docs/reference/shardkey_equal.html | 2 +- docs/reference/show_ceremony.html | 2 +- docs/reference/split_string_into_df.html | 7 +++- docs/reference/srckeep.html | 2 +- docs/reference/sub-.disk.frame.html | 2 +- docs/reference/tbl_vars.disk.frame.html | 2 +- docs/reference/write_disk.frame.html | 2 +- docs/reference/zip_to_disk.frame.html | 4 +- docs/sitemap.xml | 9 +++++ 79 files changed, 178 insertions(+), 152 deletions(-) diff --git a/docs/404.html b/docs/404.html index f6e7493..49d167f 100644 --- a/docs/404.html 
+++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 8f9ec80..d3b2e16 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index f67eac4..35e9a2b 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index 3608ddb..adfe01a 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -555,26 +555,26 @@

Sampling
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     2   1      622            625        -3      722            730
-#> 2: 2013     5  18      949            955        -6     1115           1140
-#> 3: 2013     2   3     1930           1932        -2     2211           2230
-#> 4: 2013     1   2      628            630        -2      935            932
-#> 5: 2013     5  13     1819           1825        -6     2041           2146
-#> 6: 2013     5  23       NA           1329        NA       NA           1618
+#> 1: 2013     5  24     1452           1455        -3     1840           1820
+#> 2: 2013     1  10      939            905        34     1259           1235
+#> 3: 2013     2   4     1911           1915        -4     2117           2111
+#> 4: 2013     5  12     1634           1545        49     1940           1916
+#> 5: 2013     1  16     2315           2250        25       40              7
+#> 6: 2013    12  28      913            900        13     1215           1227
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        -8      WN   3127  N723SW    EWR  BWI       45      169    6     25
-#> 2:       -25      AA    319  N539AA    LGA  ORD      110      733    9     55
-#> 3:       -19      UA   1416  N24715    EWR  IAH      201     1400   19     32
-#> 4:         3      DL   2137  N912DE    LGA  TPA      159     1010    6     30
-#> 5:       -65      UA   1075  N24702    EWR  SNA      286     2434   18     25
-#> 6:        NA      UA   1122            EWR  PBI       NA     1023   13     29
+#> 1:        20      AA   1769  N372AA    JFK  MIA      165     1089   14     55
+#> 2:        24      VX    407  N622VA    JFK  LAX      349     2475    9      5
+#> 3:         6      9E   3525  N903XJ    JFK  ORD      127      740   19     15
+#> 4:        24      DL   1773  N3739P    JFK  SLC      262     1990   15     45
+#> 5:        33      B6     30  N187JB    JFK  ROC       59      264   22     50
+#> 6:       -12      DL    422  N713TW    JFK  LAX      327     2475    9      0
 #>              time_hour
-#> 1: 2013-02-01 11:00:00
-#> 2: 2013-05-18 13:00:00
-#> 3: 2013-02-04 00:00:00
-#> 4: 2013-01-02 11:00:00
-#> 5: 2013-05-13 22:00:00
-#> 6: 2013-05-23 17:00:00
+#> 1: 2013-05-24 18:00:00 +#> 2: 2013-01-10 14:00:00 +#> 3: 2013-02-05 00:00:00 +#> 4: 2013-05-12 19:00:00 +#> 5: 2013-01-17 03:00:00 +#> 6: 2013-12-28 14:00:00

Writing Data diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index ff39373..95c0d13 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.6.999.999 + 0.7

diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index c45ac0d..fb122f5 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index 40cb549..3c67ccc 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index e757449..e8652ba 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 @@ -189,14 +189,14 @@

disk.frame
system.time(setup_disk.frame()) # ~4s
#> The number of workers available for disk.frame is 6
#> user system elapsed
-#> 0.14 0.06 2.15
+#> 0.15 0.04 2.41

We note that there is some time needed for disk.frame to start up all the workers. Next we try to convert the largest CSV file to disk.frame format. The file to be converted is about 2.2GB in size.

 time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
 
 time_to_convert_disk.frame
 #> elapsed 
-#>   27.52
+#> 29.67

Now that we have converted it, we want to do a count by the first column. To achieve this we use a “two-stage” aggregation strategy. Note that we use keep="V1" to bring only the column V1 into RAM. This avoids reading other unnecessary columns and should speed up the analysis significantly

 time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
@@ -204,7 +204,7 @@ 
disk.frame
time_to_agg_disk.frame
#> user system elapsed
-#> 0.15 0.05 7.84
+#> 0.08 0.01 8.82
+#> 0.08 0.01 8.82

We can inspect the result as well.

 summ
@@ -228,7 +228,7 @@ 
disk.frame
summarise(N = n()) %>% collect)
#> user system elapsed
-#> 1.45 0.14 6.15
+#> 1.64 0.13 4.43
+#> 1.64 0.13 4.43

However, the dplyr syntax tends to be slightly slower than using data.table syntax. This may be improved as much of the overhead is due to inefficient use of NSE.

diff --git a/docs/articles/07-glm.html b/docs/articles/07-glm.html index 1580af1..01e5f05 100644 --- a/docs/articles/07-glm.html +++ b/docs/articles/07-glm.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7
diff --git a/docs/articles/08-more-epic.html b/docs/articles/08-more-epic.html index 33c55b6..a15ed65 100644 --- a/docs/articles/08-more-epic.html +++ b/docs/articles/08-more-epic.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 diff --git a/docs/articles/09-convenience-features.html b/docs/articles/09-convenience-features.html index 81fdd50..9e2bcfc 100644 --- a/docs/articles/09-convenience-features.html +++ b/docs/articles/09-convenience-features.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 diff --git a/docs/articles/10-group-by.html b/docs/articles/10-group-by.html index 7262671..eaa55ce 100644 --- a/docs/articles/10-group-by.html +++ b/docs/articles/10-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 diff --git a/docs/articles/11-custom-group-by.html b/docs/articles/11-custom-group-by.html index d9f8d61..8ca8d70 100644 --- a/docs/articles/11-custom-group-by.html +++ b/docs/articles/11-custom-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 diff --git a/docs/articles/88-trouble-shooting.html b/docs/articles/88-trouble-shooting.html index b7c39b6..cb894f3 100644 --- a/docs/articles/88-trouble-shooting.html +++ b/docs/articles/88-trouble-shooting.html @@ -33,7 +33,7 @@ disk.frame - 0.6.1 + 0.7 diff --git a/docs/articles/index.html b/docs/articles/index.html index b3365d4..ab7832c 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/authors.html b/docs/authors.html index 4a6d5e5..7c8e758 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.6.999.999, https://diskframe.com. +R package version 0.7, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.6.999.999},
+  note = {R package version 0.7},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index c636c75..f86e68c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -248,12 +248,15 @@

dplyr verbs
filter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2)
-#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
-#> 1: 2013 1 1 517 515 2 830 819 11 UA 1545 N14228
-#> 2: 2013 1 1 533 529 4 850 830 20 UA 1714 N24211
-#> origin dest air_time distance hour minute time_hour origin_dest
-#> 1: EWR IAH 227 1400 5 15 2013-01-01 05:00:00 EWRIAH
-#> 2: LGA IAH 227 1416 5 29 2013-01-01 05:00:00 LGAIAH
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013 1 1 517 515 2 830 819 11
+#> 2: 2013 1 1 533 529 4 850 830 20
+#> carrier flight tailnum origin dest air_time distance hour minute
+#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15
+#> 2: UA 1714 N24211 LGA IAH 227 1416 5 29
+#> time_hour origin_dest
+#> 1: 2013-01-01 05:00:00 EWRIAH
+#> 2: 2013-01-01 05:00:00 LGAIAH

Group-by @@ -404,22 +407,26 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp0yOSxO\\filecf453d9d99.df"

+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQJt0m9\\file393027da33b.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
-#> 1: 2013     1   1      517            515         2      830            819        11      UA   1545  N14228
-#>    origin dest air_time distance hour minute           time_hour
-#> 1:    EWR  IAH      227     1400    5     15 2013-01-01 05:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay +#> 1: 2013 1 1 517 515 2 830 819 11 +#> carrier flight tailnum origin dest air_time distance hour minute +#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 +#> time_hour +#> 1: 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum
-#> 1: 2013     9  30       NA            840        NA       NA           1020        NA      MQ   3531  N839MQ
-#>    origin dest air_time distance hour minute           time_hour
-#> 1:    LGA  RDU       NA      431    8     40 2013-09-30 08:00:00
+#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay +#> 1: 2013 9 30 NA 840 NA NA 1020 NA +#> carrier flight tailnum origin dest air_time distance hour minute +#> 1: MQ 3531 N839MQ LGA RDU NA 431 8 40 +#> time_hour +#> 1: 2013-09-30 08:00:00
 # number of rows
 nrow(flights.df)
diff --git a/docs/news/index.html b/docs/news/index.html
index b12731e..551c949 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.6.1
+        0.7
       
     
@@ -91,7 +91,12 @@

Changelog

+
+  • Partitioned by folder
+  • Updated R version to 4
+
   • Fixed bug with data.table syntax
   • Auto detection of srckeep in group by
   • Global detection for group by and summarise
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index ec31e3e..73b2fde 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -14,5 +14,5 @@ articles:
   10-group-by: 10-group-by.html
   11-custom-group-by: 11-custom-group-by.html
   88-trouble-shooting: 88-trouble-shooting.html
-last_built: 2022-02-02T10:59Z
+last_built: 2022-02-02T11:20Z
diff --git a/docs/reference/add_chunk.html b/docs/reference/add_chunk.html
index 2228be8..6f1c0da 100644
--- a/docs/reference/add_chunk.html
+++ b/docs/reference/add_chunk.html
@@ -18,7 +18,7 @@
       disk.frame
-        0.6.999.999
+        0.7
@@ -135,12 +135,12 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 @@ -154,12 +154,12 @@

Examples

# you wish to add multiple chunk in parralel add_chunk(df2, data.frame(chunk=1), 1) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 1 #> nrow (at source): 1 #> ncol (at source): 1 add_chunk(df2, data.frame(chunk=2), 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 2 #> nrow (at source): 2 #> ncol (at source): 1 diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index 7f2dffe..e93f149 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 0d3427a..0241608 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index 86693e5..655a13b 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html index 0143bc9..0f02824 100644 --- a/docs/reference/bind_rows.disk.frame.html +++ b/docs/reference/bind_rows.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index f596c04..52940b9 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -34,7 +34,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index 0161dca..5909c5a 100644 --- a/docs/reference/cmap.html +++ b/docs/reference/cmap.html @@ -21,7 +21,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -257,7 +257,7 @@

Examples

# return the first row of each chunk eagerly as list cmap(cars.df, ~.x[1,], lazy = FALSE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file1710252959dd.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c385263150e.df" #> nchunks: 6 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index 49aa645..5a9bf03 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/collect.html b/docs/reference/collect.html index ab12f85..358f66f 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -20,7 +20,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 637ba39..0956e34 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -20,7 +20,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 3896951..bf8e3c8 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index a3bca37..3a42580 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index 7c88d8c..7e59e58 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/delete.html b/docs/reference/delete.html index c36e9bc..5ac2c8d 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/df_ram_size.html b/docs/reference/df_ram_size.html index 6c636c9..898899a 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index da652dc..b6b3dd1 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index 2912acf..e4c01f9 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -111,7 +111,7 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 755512a..ff377e4 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.999.999
+        0.7
       
     
diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index b2fd58c..78e280d 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/find_globals_recursively.html b/docs/reference/find_globals_recursively.html index 094f26d..b19e8a8 100644 --- a/docs/reference/find_globals_recursively.html +++ b/docs/reference/find_globals_recursively.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index fa5ddd0..383c9d0 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index e194526..f890a9a 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index d41a992..a7e53e5 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index 01e4fcb..ac2822d 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -122,12 +122,12 @@

Examples

# return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\Rtmp2lCstP\\file17106b247014.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) diff --git a/docs/reference/get_partition_paths.html b/docs/reference/get_partition_paths.html index 8ff4d44..7dff33c 100644 --- a/docs/reference/get_partition_paths.html +++ b/docs/reference/get_partition_paths.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -102,7 +102,7 @@

Get the partitioning structure of a folder

Arguments

df
-a disk.frame whose paths will be used to determine if it' s
+a disk.frame whose paths will be used to determine if it's
 folder-partitioned disk.frame

diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index f32ba71..2a026cd 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index e490ad5..d1c2b15 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 32de0af..50bd1d5 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/index.html b/docs/reference/index.html index 6900fdb..264deb0 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index a533983..0a09221 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/join.html b/docs/reference/join.html index b529a90..66d1b40 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 7da36ad..a9e7818 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index 2d9a833..08f3ff9 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index c65a953..b28af7b 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 38f5361..f9a52a0 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index d509b27..b5d845b 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index 5673efc..bed4cc3 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 5edf126..67a2b15 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/partition_filter.html b/docs/reference/partition_filter.html index 95ce4d0..14d6233 100644 --- a/docs/reference/partition_filter.html +++ b/docs/reference/partition_filter.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/play.html b/docs/reference/play.html index 326a0a2..1fd2f78 100644 --- a/docs/reference/play.html +++ b/docs/reference/play.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff 
--git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index 9e5b9bb..d39140b 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index bd63b61..41b02d1 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html index ee2f372..5a118b6 100644 --- a/docs/reference/purrr_as_mapper.html +++ b/docs/reference/purrr_as_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index b8ce4aa..4ca1ab3 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index 53a6ae6..1e7fc40 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -126,8 +126,8 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\back_up_tmp_dir1710762812d3. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file17106dc327d2.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\back_up_tmp_dir1c382d2e21d6. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3824aa62c4.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index 11b63d0..f1a9a12 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index b7e8f1e..8fbb459 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -116,7 +116,7 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 @@ -125,7 +125,7 @@

Examples

# removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 @@ -134,7 +134,7 @@

Examples

# removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171025e93bdf.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 82e1e23..45c890a 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -127,27 +127,27 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist -#> 1: 4 10 -#> 2: 7 4 +#> 1: 7 22 +#> 2: 10 18 #> 3: 9 10 -#> 4: 10 18 +#> 4: 8 16 #> 5: 11 17 -#> 6: 12 28 -#> 7: 12 20 +#> 6: 13 34 +#> 7: 13 34 #> 8: 12 14 -#> 9: 16 32 -#> 10: 14 60 -#> 11: 15 20 -#> 12: 14 26 -#> 13: 17 32 +#> 9: 14 80 +#> 10: 15 54 +#> 11: 14 36 +#> 12: 16 32 +#> 13: 17 50 #> 14: 18 42 -#> 15: 17 40 -#> 16: 18 56 -#> 17: 19 68 +#> 15: 18 56 +#> 16: 18 84 +#> 17: 20 52 #> 18: 20 32 -#> 19: 20 56 +#> 19: 20 64 #> 20: 19 46 -#> 21: 24 120 +#> 21: 24 92 #> 22: 24 70 #> speed dist diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 4e6a3d8..a299498 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 5d4cfbb..f419f59 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index a66f3df..f455323 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index 46a45da..7b013cd 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index 3686247..8df3ffd 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/split_string_into_df.html b/docs/reference/split_string_into_df.html index 9e33093..9a65c61 100644 --- a/docs/reference/split_string_into_df.html +++ b/docs/reference/split_string_into_df.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -99,6 +99,11 @@

Turn a string of the form /partition1=val/partition2=val2 into a data.frame

split_string_into_df(path_strs)
+
+Arguments
+
+path_strs
+
+The paths in string form to break into partition format
+
diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 231a4d1..9023b0d 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index d5d64d0..6a51358 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 187568d..b57289c 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.999.999 + 0.7 diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index 73b3980..9a170ce 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.6.999.999 + 0.7 @@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) -#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\Rtmp2lCstP\file171021f94c27.zip' cannot be opened +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3872483ca5.zip' cannot be opened # there is only one csv file so it return a list of one disk.frame zip.df[[1]] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index cedcfdd..b50973e 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -186,6 +186,9 @@ /reference/get_chunk_ids.html + + /reference/get_partition_paths.html + /reference/groups.disk.frame.html @@ -243,6 +246,9 @@ /reference/overwrite_check.html + + /reference/partition_filter.html + /reference/play.html @@ -285,6 +291,9 @@ /reference/show_ceremony.html + + /reference/split_string_into_df.html + /reference/srckeep.html From 4cedfe397b517fad4ac00bfb039e001dd1c8f76c Mon Sep 17 00:00:00 2001 From: ZJ Dai Date: Sat, 5 Feb 2022 22:44:13 +1100 Subject: [PATCH 14/16] added a conversion --- NAMESPACE | 2 ++ R/disk.frame-to-parquet.r | 33 +++++++++++++++++++++++++++++++++ R/zzz.r | 33 ++++++++++++++++++++++++++++----- man/disk.frame_to_parquet.Rd | 11 +++++++++++ 4 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 R/disk.frame-to-parquet.r create mode 100644 man/disk.frame_to_parquet.Rd diff --git a/NAMESPACE b/NAMESPACE index 388b3d4..03a29be 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,6 +82,7 @@ export(delete) export(df_ram_size) export(dfglm) export(disk.frame) +export(disk.frame_to_parquet) export(distribute) export(evalparseglue) export(foverlaps.disk.frame) @@ -139,6 +140,7 @@ import(fs) import(fst) import(stringr) importFrom(Rcpp,evalCpp) +importFrom(arrow,write_parquet) importFrom(benchmarkme,get_ram) importFrom(bigreadr,get_split_files) importFrom(bigreadr,split_file) diff --git a/R/disk.frame-to-parquet.r b/R/disk.frame-to-parquet.r new file mode 100644 index 0000000..e2f9238 --- /dev/null +++ b/R/disk.frame-to-parquet.r @@ -0,0 +1,33 @@ +#' A function to convert a disk.frame to parquet format +#' @importFrom arrow write_parquet +#' @export +disk.frame_to_parquet <- function(df, outdir) { + if("disk.frame" %in% class(df)) { + path = attr(df, "path") + } else { + path = df + } + + path = normalizePath(path) + fst_files = normalizePath(list.files(path, "fst$", full.names = TRUE, recursive=TRUE)) + + if(!fs::dir_exists(outdir)) { + fs::dir_create(outdir) + } + + + future.apply::future_lapply(fst_files, function(fst_file) { + file_name = basename(fst_file) + file_name = paste0(stringr::str_sub(file_name, 1, nchar(file_name)-4), ".parquet") + path_name = normalizePath(dirname(fst_file)) + + # remove base directory from path + path_name = stringr::str_sub(path_name, nchar(path)+1) + + if(!fs::dir_exists(file.path(outdir, path_name))) { + fs::dir_create(file.path(outdir, path_name)) + } + + arrow::write_parquet(fst::read_fst(fst_file), file.path(outdir, path_name, file_name)) + }) +} \ No newline at end of file diff --git a/R/zzz.r b/R/zzz.r index 077b1cf..9dce00b 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -8,12 +8,34 @@ #setup_disk.frame() packageStartupMessage( - crayon::red( + crayon::blue( +"\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated and I recommend users to swith over to using the {arrow} package for handling larger-than-RAM data. 
You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n
+```
+disk.frame::disk.frame_to_parquet(path.to.your.disk.frame, parquet_path)
+```
+
+Once done you can use {arrow}'s dataset features to manipulate the larger-than-RAM data using dplyr verbs. E.g.
+
+```
+dataset = arrow::open_dataset(parquet_path)
+
+dataset |>
+  mutate(...) |>
+  group_by(...) |>
+  summarize(...) |>
+  collect(...)
+```
+"
+  ),
+  crayon::red(
     glue::glue(
-      "\n\n## Message from disk.frame:
+      "\n\n## Message from disk.frame:
We have {future::nbrOfWorkers()} workers to use with disk.frame.
-To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.")),
-    crayon::green("\n\n
+To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults."
+    )
+  ),
+  crayon::green(
+    "\n\n
 It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:\n\n
 ```r
 # this will set up disk.frame with multiple workers
 setup_disk.frame()
 # this will allow unlimited amount of data to be passed from worker to worker
 options(future.globals.maxSize = Inf)
 ```
-\n\n"))
+\n\n"
+  ))
 }
 
 globalVariables(c(
diff --git a/man/disk.frame_to_parquet.Rd b/man/disk.frame_to_parquet.Rd
new file mode 100644
index 0000000..0c4d8db
--- /dev/null
+++ b/man/disk.frame_to_parquet.Rd
@@ -0,0 +1,11 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/disk.frame-to-parquet.r
+\name{disk.frame_to_parquet}
+\alias{disk.frame_to_parquet}
+\title{A function to convert a disk.frame to parquet format}
+\usage{
+disk.frame_to_parquet(df, outdir)
+}
+\description{
+A function to convert a disk.frame to parquet format
+}

From fd60fd3f6f326cbcacf5aeccb5c370e0548e1d34 Mon Sep 17 00:00:00 2001
From: ZJ Dai
Date: Mon, 7 Feb 2022 16:16:08 +1100
Subject: [PATCH 15/16] ready for submission

---
 DESCRIPTION | 3 ++-
 R/disk.frame-to-parquet.r | 2 ++
 R/zzz.r | 2 --
 man/disk.frame_to_parquet.Rd | 5 +++++
 utils/build_utils.R | 18 ------------------
 5 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3c33aa0..77f0937 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -31,7 +31,8 @@ Imports:
     benchmarkme,
     purrr (>= 0.3.2),
     globals,
-    rlang
+    rlang,
+    arrow
 Depends:
     R (>= 4.0),
     dplyr (>= 1.0.0)
diff --git a/R/disk.frame-to-parquet.r b/R/disk.frame-to-parquet.r
index e2f9238..d9ec2e4 100644
--- a/R/disk.frame-to-parquet.r
+++ b/R/disk.frame-to-parquet.r
@@ -1,6 +1,8 @@
 #' A function to convert a disk.frame to parquet format
 #' @importFrom arrow write_parquet
 #' @export
+#' @param df a disk.frame or a path to a disk.frame
+#' @param outdir the path to save the parquet files
 disk.frame_to_parquet <- function(df, outdir) {
   if("disk.frame" %in% class(df)) {
     path = attr(df, "path")
diff --git a/R/zzz.r b/R/zzz.r
index 9dce00b..e486cd0 100644
--- a/R/zzz.r
+++ b/R/zzz.r
@@ -5,8 +5,6 @@
 #' @importFrom future nbrOfWorkers
 #' @importFrom crayon red blue green
 .onAttach <- function(libname, pkgname) {
-  #setup_disk.frame()
-
   packageStartupMessage(
     crayon::blue(
 "\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated and I recommend users to swith over to using the {arrow} package for handling larger-than-RAM data. You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n
diff --git a/man/disk.frame_to_parquet.Rd b/man/disk.frame_to_parquet.Rd
index 0c4d8db..559a9a7 100644
--- a/man/disk.frame_to_parquet.Rd
+++ b/man/disk.frame_to_parquet.Rd
@@ -6,6 +6,11 @@
 \usage{
 disk.frame_to_parquet(df, outdir)
 }
+\arguments{
+\item{df}{a disk.frame or a path to a disk.frame}
+
+\item{outdir}{the path to save the parquet files}
+}
 \description{
 A function to convert a disk.frame to parquet format
 }
diff --git a/utils/build_utils.R b/utils/build_utils.R
index 3c008d9..4e95559 100644
--- a/utils/build_utils.R
+++ b/utils/build_utils.R
@@ -1,21 +1,3 @@
-# df_bookdown_build <- function() {
-#   rmds = list.files("vignettes", pattern = "*.Rmd")
-#   sapply(rmds, function(file) {
-#     fs::file_copy(
-#       file.path("book", file),
-#       file.path("vignettes", file),
-#       overwrite = TRUE
-#     )
-#   })
-#   if(fs::dir_exists("book/_bookdown_files")) {
-#     fs::dir_delete("book/_bookdown_files")
-#   }
-#   while(fs::dir_exists("book/_bookdown_files")) {
-#     Sys.sleep(1)
-#   }
-#   rmarkdown::render_site("book", encoding = 'UTF-8')
-# }
-
 df_build_site <- function() {
   df_build_readme()
   df_setup_vignette()

From 9b44fdd49fff03b1afa574dcf77d4d386f8f11ec Mon Sep 17 00:00:00 2001
From: ZJ Dai
Date: Mon, 7 Mar 2022 22:11:38 +1100
Subject: [PATCH 16/16] fixes #84

---
 DESCRIPTION | 6 +++---
 NEWS.md | 7 +++++++
 R/collect.summarized_disk.frame.r | 18 +++++++++++++++---
 R/one-stage-verbs.R | 2 +-
 R/zzz.r | 2 +-
 5 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 77f0937..51eedc8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: disk.frame
 Title: Larger-than-RAM Disk-Based Data Manipulation Framework
-Version: 0.7
-Date: 2022-02-015
+Version: 0.7.2
+Date: 2022-03-07
 Authors@R: c(
     person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")),
     person("Jacky", "Poon", role = c("ctb"))
@@ -53,4 +53,4 @@ LinkingTo:
 RoxygenNote: 7.1.2
 Encoding: UTF-8
 URL: https://diskframe.com
-BugReports: https://github.com/xiaodaigh/disk.frame/issues
+BugReports: https://github.com/DiskFrame/disk.frame/issues
diff --git a/NEWS.md b/NEWS.md
index db7468a..501a74a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# disk.frame 0.7.2
+* Bugfixes Github 384
+
+# disk.frame 0.7.1
+* Bugfixes
+* Added ability to convert to Parquet
+
 # disk.frame 0.7
 * Partitioned by folder
 * Updated R version to 4
diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r
index cff94ff..58c7a50 100644
--- a/R/collect.summarized_disk.frame.r
+++ b/R/collect.summarized_disk.frame.r
@@ -29,7 +29,7 @@ collect.summarized_disk.frame <-
   dotdotdot <- attr(x, 'summarize_code')
   group_by_vars = attr(x, "group_by_cols")

-  # look at the group by and summarise codes and figure out which columns need to be
+  # look at the group-by and summarise code and figure out which columns need to be
   # srckeep
   df_to_find_cols = fst::read_fst(get_chunk_ids(x, full.names = TRUE)[1], from=1, to=1)

@@ -41,9 +41,21 @@
     globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv()))
   }) %>% unlist %>% unique

-  src_keep_cols = intersect(names(df_to_find_cols), c(cols_in_summ, cols_in_group_by) %>% unique)
+  cols_used = c(cols_in_summ, cols_in_group_by) %>% unique
+  src_keep_cols = intersect(names(df_to_find_cols), cols_used)
+
+  # are there any variables used in the group by or summarise that are not present in the original data?
+  # if yes, then that indicates this could be more complicated, e.g. a new var was created with mutate
+  extra_vars = setdiff(cols_used, names(df_to_find_cols))
+  if(length(extra_vars) > 0) {
+    warning(sprintf(
+      "These columns appear in the group-by and summarise but do not appear in the original data set: %s. This is too hard for disk.frame to figure out `srckeep` automatically, so you must specify `srckeep` manually."
+      , paste0(extra_vars, collapse = ", ")))
+  } else {
+    x = srckeep(x, src_keep_cols)
+  }
+
-  x = srckeep(x, src_keep_cols)

   # make a copy
   dotdotdot_chunk_agg <- dotdotdot
diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R
index b9fabba..39d9b35 100644
--- a/R/one-stage-verbs.R
+++ b/R/one-stage-verbs.R
@@ -260,7 +260,7 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame

 group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) {
   class(.data) <- c("grouped_disk.frame", "disk.frame")

-  # using rlang is a neccesary evil here as I need to deal with !!! that is supported by group_by etc
+  # using rlang is a necessary evil here as I need to deal with !!! that is supported by group_by etc
   group_by_cols = rlang::enexprs(...)

   # convert any quosure to labels
diff --git a/R/zzz.r b/R/zzz.r
index e486cd0..02b5feb 100644
--- a/R/zzz.r
+++ b/R/zzz.r
@@ -7,7 +7,7 @@
 .onAttach <- function(libname, pkgname) {
   packageStartupMessage(
     crayon::blue(
-"\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated and I recommend users to swith over to using the {arrow} package for handling larger-than-RAM data. You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n
+"\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated. You are recommended to switch over to using the {arrow} package for handling larger-than-RAM data. You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n
 ```
 disk.frame::disk.frame_to_parquet(path.to.your.disk.frame, parquet_path)
 ```
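For reference, the conversion workflow described by the new startup message can be sketched end-to-end as follows. This is a minimal, illustrative sketch only: `flights.df` is the disk.frame built in the vignettes above, `flights_parquet` is a placeholder output folder, and the dplyr-on-arrow verbs assume a reasonably recent {arrow} release.

```r
library(dplyr)

# write each .fst chunk of the disk.frame out as a .parquet file
disk.frame::disk.frame_to_parquet(flights.df, outdir = "flights_parquet")

# open the folder of parquet files as a larger-than-RAM arrow Dataset
flights_ds = arrow::open_dataset("flights_parquet")

# dplyr verbs build a lazy query; collect() executes it and returns
# an in-memory data frame
flights_ds %>%
  group_by(origin) %>%
  summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) %>%
  collect()
```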
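Similarly, the srckeep auto-detection guard added in patch 16 can be illustrated with a hedged sketch: when a grouping variable is derived on the fly rather than read from a source column, disk.frame cannot infer which columns to keep, so `srckeep` should be supplied by hand. Here `is_delayed` is a hypothetical derived column and `flights.df` again refers to the vignette data.

```r
library(dplyr)

# grouping by a derived column: `is_delayed` is not a source column, so
# per the new check disk.frame warns and falls back to reading all columns
flights.df %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(is_delayed) %>%
  summarize(n = n()) %>%
  collect()

# restricting the columns read from disk manually instead
flights.df %>%
  srckeep("dep_delay") %>%
  mutate(is_delayed = dep_delay > 0) %>%
  group_by(is_delayed) %>%
  summarize(n = n()) %>%
  collect()
```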