diff --git a/DESCRIPTION b/DESCRIPTION index 1301c323..51eedc85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.6.0 -Date: 2022-01-31 +Version: 0.7.2 +Date: 2022-03-07 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) @@ -30,9 +30,11 @@ Imports: bit64, benchmarkme, purrr (>= 0.3.2), - rlang + globals, + rlang, + arrow Depends: - R (>= 3.4), + R (>= 4.0), dplyr (>= 1.0.0) Suggests: nycflights13, @@ -51,4 +53,4 @@ LinkingTo: RoxygenNote: 7.1.2 Encoding: UTF-8 URL: https://diskframe.com -BugReports: https://github.com/xiaodaigh/disk.frame/issues +BugReports: https://github.com/DiskFrame/disk.frame/issues diff --git a/NAMESPACE b/NAMESPACE index 2def4912..03a29bef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,6 +82,7 @@ export(delete) export(df_ram_size) export(dfglm) export(disk.frame) +export(disk.frame_to_parquet) export(distribute) export(evalparseglue) export(foverlaps.disk.frame) @@ -113,6 +114,7 @@ export(nchunks) export(ncol) export(nrow) export(overwrite_check) +export(partition_filter) export(quantile_df.chunk_agg.disk.frame) export(quantile_df.collected_agg.disk.frame) export(rbindlist.disk.frame) @@ -138,6 +140,7 @@ import(fs) import(fst) import(stringr) importFrom(Rcpp,evalCpp) +importFrom(arrow,write_parquet) importFrom(benchmarkme,get_ram) importFrom(bigreadr,get_split_files) importFrom(bigreadr,split_file) @@ -165,6 +168,7 @@ importFrom(dplyr,full_join) importFrom(dplyr,glimpse) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) +importFrom(dplyr,group_map) importFrom(dplyr,group_vars) importFrom(dplyr,groups) importFrom(dplyr,inner_join) @@ -195,6 +199,7 @@ importFrom(future,nbrOfWorkers) importFrom(future,plan) importFrom(future,sequential) importFrom(future.apply,future_lapply) +importFrom(globals,findGlobals) importFrom(glue,glue) 
importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) @@ -215,5 +220,6 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,memory.limit) importFrom(utils,tail) +importFrom(utils,type.convert) importFrom(utils,unzip) useDynLib(disk.frame) diff --git a/NEWS.md b/NEWS.md index e075c6da..501a74ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# disk.frame 0.7.2 +* Bugfixes Github 384 + +# disk.frame 0.7.1 +* Bugfixes +* Add ability to convert to Parquet + +# disk.frame 0.7 +* Partitioned by folder +* Updated R version to 4 + +# disk.frame 0.6.1 +* Fixed bug with data.table syntax +* Auto detection of srckeep in group by +* Global detection for group by and summarise + # disk.frame 0.6 * Much better NSE support in disk.frame! * removed `hard_arrange` and `hard_group_by` diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index 1b0d3ff7..dc6b0556 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -41,11 +41,36 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FA if(!is.null(warning_msg)) { warning(warning_msg) } + dotdotdot = rlang::enexprs(...) 
+ + # convert any quosure to labels + for (i in seq_along(dotdotdot)) { + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() + } + + args_str = mapply(function(name, val) { + rhs = deparse(val) %>% + paste0(collapse = "") + if(name != "") { + sprintf("%s=%s", name, rhs) + } else { + rhs + } + }, names(dotdotdot), dotdotdot) %>% + paste0(collapse = ", ") + + if (args_str == "") { + code = parse(text="chunk_fn(.disk.frame.chunk)")[[1]] + } else { + code = parse(text=sprintf("chunk_fn(.disk.frame.chunk, %s)", args_str))[[1]] + } + # you need to use list otherwise the names will be gone - code = substitute(chunk_fn(.disk.frame.chunk, ...)) + # code = substitute(chunk_fn(.disk.frame.chunk, ...)) - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { + if (paste0(deparse(code), collapse="") == "chunk_fn(.disk.frame.chunk, NULL)") { globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) } else { globals_and_pkgs = future::getGlobalsAndPackages(code) diff --git a/R/collect.r b/R/collect.r index cf5add15..98347581 100644 --- a/R/collect.r +++ b/R/collect.r @@ -25,14 +25,61 @@ #' @rdname collect collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings"))) { cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE) + # obtain filters from structure + partitioned_paths_or_not = get_partition_paths(x) + + if (partitioned_paths_or_not$is_partitioned) { + partitioned_paths = partitioned_paths_or_not$paths + if(length(partitioned_paths) >= 1) { + # filter the cids based on the paths + tmp = data.frame(paths = cids) %>% + mutate(dirname = dirname(paths)) %>% + inner_join(data.frame(dirname = sapply(partitioned_paths, tools::file_path_as_absolute)), by = "dirname") + cids = tmp$paths + } + } + if(nchunks(x) > 0) { if(parallel) { - tmp = future.apply::future_lapply(cids, function(.x) { - get_chunk(x, .x, full.names = TRUE) - }, future.seed = TRUE) + tmp<-future.apply::future_lapply(cids, function(.x, meh) { + 
if(partitioned_paths_or_not$is_partitioned) { + dirpath = dirname(.x) + tmp2 = partitioned_paths_or_not$df %>% + mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path)) + + tmp2a = tmp2 %>% + filter(fullpath == dirpath) %>% + mutate(.check=1) + + stopifnot(nrow(tmp2a) == 1) + + tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a) + return(tmp3) + } else { + return(get_chunk.disk.frame(x, .x, full.names = TRUE)) + } + }, future.seed = NULL) return(rbindlist(tmp)) } else { - purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE)) + tmp<-lapply(cids, function(.x, meh) { + if(partitioned_paths_or_not$is_partitioned) { + dirpath = dirname(.x) + tmp2 = partitioned_paths_or_not$df %>% + mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path)) + + tmp2a = tmp2 %>% + filter(fullpath == dirpath) %>% + mutate(.check=1) + + stopifnot(nrow(tmp2a) == 1) + + tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a) + return(tmp3) + } else { + return(get_chunk.disk.frame(x, .x, full.names = TRUE)) + } + }) + return(rbindlist(tmp)) } } else { data.table() diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index edd686a4..58c7a500 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -13,6 +13,7 @@ #' @importFrom data.table data.table as.data.table #' @importFrom purrr map_dfr #' @importFrom dplyr collect select mutate +#' @importFrom globals findGlobals #' @return collect return a data.frame/data.table #' @examples #' cars.df = as.disk.frame(cars) @@ -26,7 +27,36 @@ collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x, "recordings"))) { dotdotdot <- attr(x, 'summarize_code') + group_by_vars = attr(x, "group_by_cols") + + # look at the group by and summaries codes and figure out which columns need to be + # srckeep + df_to_find_cols = 
fst::read_fst(get_chunk_ids(x, full.names = TRUE)[1], from=1, to=1) + + cols_in_summ = lapply(dotdotdot, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + cols_in_group_by = lapply(group_by_vars, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + cols_used = c(cols_in_summ, cols_in_group_by) %>% unique + src_keep_cols = intersect(names(df_to_find_cols), cols_used) + + # are there any variables used in the group by or summarise that is not present in the original data? + # if yes then that indicates this could be more complicated e.g. a new var was created with mutate + extra_vars = setdiff(cols_used, names(df_to_find_cols)) + if(length(extra_vars) > 0) { + warning(sprintf( + "These columns that appear in the group-by and summarise does not appear in the original data set: %s. This set of action is too hard for disk.frame to figure out the `srckeep` automatically, you must do the `srckeep` manually." 
+ , paste0(extra_vars, collapse = ", "))) + } else { + x = srckeep(x, src_keep_cols) + } + + # make a copy dotdotdot_chunk_agg <- dotdotdot dotdotdot_collected_agg <- dotdotdot @@ -74,8 +104,18 @@ collect.summarized_disk.frame <- } } + group_by_globals_list = attr(x_as.disk.frame, "group_by_globals_and_pkgs")$globals + + if(is.null(group_by_globals_list)) { + eval_clos = parent.frame() + } else { + eval_clos = list2env(group_by_globals_list, parent=parent.frame()) + } + # TODO add appropriate environment - tmp_df = eval(first_stage_code) + # tmp_df = eval(first_stage_code, envir=environment(), enclos = eval_clos + tmp_df = eval(first_stage_code, group_by_globals_list) + n_summ_args = length(dotdotdot_chunk_agg) @@ -103,7 +143,9 @@ collect.summarized_disk.frame <- ) } - tmp2 = collect(eval(parse(text = chunk_summ_code_str))) + summarize_globals_list = attr(x_as.disk.frame, "summarize_globals_and_pkgs")$globals + + tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = summarize_globals_list)) second_stage_code = eval(parse(text = sprintf( "quote(group_by(tmp2, %s))", paste0(rep_len("NULL", n_grp_args), collapse = ", ") @@ -111,7 +153,12 @@ collect.summarized_disk.frame <- if (n_grp_args >= 1) { for (i in 1:n_grp_args) { - second_stage_code[[i + 2]] = group_by_vars[[i]] + second_stage_code[[i + 2]] = group_by_vars[[i]] %>% + deparse() %>% + paste0(collapse="") %>% + sprintf("`%s`", .) %>% + parse(text=.) 
%>% + .[[1]] } } diff --git a/R/data.table.r b/R/data.table.r index 0d12184f..9d648121 100644 --- a/R/data.table.r +++ b/R/data.table.r @@ -24,42 +24,7 @@ code = substitute(chunk[...]) - # sometimes the arguments could be empty - # in a recent version of globals that would cause a fail - # to avoid the fail remove them from the test - #dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))] - - #ag = globals::findGlobals(dotdotdot_for_find_global) - #ag = setdiff(ag, "") # "" can cause issues with future # this line no longer needed - - - # you need to use list otherwise the names will be gone - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { - globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) - } else { - globals_and_pkgs = future::getGlobalsAndPackages(code) - } - - - global_vars = globals_and_pkgs$globals - - env = parent.frame() - - done = identical(env, emptyenv()) || identical(env, globalenv()) - - # keep adding global variables by moving up the environment chain - while(!done) { - tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) - new_global_vars = tmp_globals_and_pkgs$globals - for (name in setdiff(names(new_global_vars), names(global_vars))) { - global_vars[[name]] <- new_global_vars[[name]] - } - - done = identical(env, emptyenv()) || identical(env, globalenv()) - env = parent.env(env) - } - - globals_and_pkgs$globals = global_vars + globals_and_pkgs = find_globals_recursively(code, parent.frame()) res = future.apply::future_lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { #res = lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { diff --git a/R/disk.frame-to-parquet.r b/R/disk.frame-to-parquet.r new file mode 100644 index 00000000..d9ec2e45 --- /dev/null +++ b/R/disk.frame-to-parquet.r @@ -0,0 +1,35 @@ +#' A function to convert a disk.frame to parquet format +#' @importFrom arrow write_parquet +#' @export 
+#' @param df a disk.frame or a path to a disk.frame +#' @param outdir the path to save the parquet files +disk.frame_to_parquet <- function(df, outdir) { + if("disk.frame" %in% class(df)) { + path = attr(df, "path") + } else { + path = df + } + + path = normalizePath(path) + fst_files = normalizePath(list.files(path, "fst$", full.names = TRUE, recursive=TRUE)) + + if(!fs::dir_exists(outdir)) { + fs::dir_create(outdir) + } + + + future.apply::future_lapply(fst_files, function(fst_file) { + file_name = basename(fst_file) + file_name = paste0(stringr::str_sub(file_name, 1, nchar(file_name)-4), ".parquet") + path_name = normalizePath(dirname(fst_file)) + + # remove base directory from path + path_name = stringr::str_sub(path_name, nchar(path)+1) + + if(!fs::dir_exists(file.path(outdir, path_name))) { + fs::dir_create(file.path(outdir, path_name)) + } + + arrow::write_parquet(fst::read_fst(fst_file), file.path(outdir, path_name, file_name)) + }) +} \ No newline at end of file diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 285c678f..49586774 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -137,6 +137,7 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup) #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework") + #' @export #' @rdname dplyr_verbs glimpse.disk.frame <- function(.data, ...) { diff --git a/R/get_chunk.r b/R/get_chunk.r index 01c06e61..c0125985 100644 --- a/R/get_chunk.r +++ b/R/get_chunk.r @@ -4,6 +4,7 @@ #' @param keep the columns to keep #' @param full.names whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric #' @param ... passed to fst::read_fst or whichever read function is used in the backend +#' @param partitioned_info for internal use only. 
It's a data.frame used to help with filtering by partitions #' @export #' @examples #' cars.df = as.disk.frame(cars, nchunks = 2) @@ -24,7 +25,7 @@ get_chunk <- function(...) { #' @rdname get_chunk #' @importFrom fst read_fst #' @export -get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { +get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info=NULL) { stopifnot("disk.frame" %in% class(df)) # keep_chunks = attr(df, "keep_chunks", exact=TRUE) @@ -44,9 +45,10 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { keep1_vars = paste0(keep1, collapse = ", ") keep_no_good_vars = setdiff(keep, keep1) %>% paste0(collapse = ", ") stop( - glue::glue( - "This disk.frame has a srckeep containing these variables {keep1_vars}. - You are trying to keep {keep_no_good_vars}, which are not available.")) + sprintf( + "This disk.frame has a `srckeep` containing these columns: `%s`. + You are trying to keep `%s`, which are not available.", + paste0(keep1_vars, collapse=", "), paste0(keep_no_good_vars, collapse=", "))) } keep = intersect(keep1, keep) if (!all(keep %in% keep1)) { @@ -78,19 +80,37 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { return(notbl) } - if (is.null(recordings)) { if(typeof(keep)=="closure") { - fst::read_fst(filename, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, as.data.table = TRUE,...) } else { - fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) 
+ } + + if(!is.null(partitioned_info)) { + res = tmp %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(res) + } else{ + return(tmp) } } else { - if(typeof(keep)!="closure") { - play(fst::read_fst(filename, as.data.table = TRUE,...), recordings) + if(typeof(keep)=="closure") { + tmp_df_input = fst::read_fst(filename, as.data.table = TRUE,...) } else { - play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), recordings) - + tmp_df_input = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + } + + if(!is.null(partitioned_info)) { + res = tmp_df_input %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(play(res, recordings)) + } else{ + return(play(tmp_df_input, recordings)) } } } diff --git a/R/get_chunk_ids.r b/R/get_chunk_ids.r index bbb3890a..9a023e90 100644 --- a/R/get_chunk_ids.r +++ b/R/get_chunk_ids.r @@ -20,19 +20,19 @@ #' # clean up cars.df #' delete(cars.df) get_chunk_ids <- function(df, ..., full.names = FALSE, strip_extension = TRUE) { - lf = list.files(attr(df,"path"), full.names = full.names, ...) 
+ stopifnot("disk.frame" %in% class(df)) + + lf = list.files(attr(df,"path"), full.names = full.names, ..., recursive = TRUE) if(full.names) { return(lf) } - purrr::map_chr(lf, ~{ - tmp = stringr::str_split(.x,stringr::fixed("."), simplify = TRUE) - l = length(tmp) - if(l == 1) { - return(tmp) - } else if(strip_extension) { - paste0(tmp[-l], collapse="") - } else { - .x + + # strip out the path or file name if required + sapply(lf, function(path) { + tmp = basename(path) + if (strip_extension) { + tmp = tools::file_path_sans_ext(tmp) } + return(tmp) }) } diff --git a/R/get_partition.r b/R/get_partition.r new file mode 100644 index 00000000..b43e68f7 --- /dev/null +++ b/R/get_partition.r @@ -0,0 +1,71 @@ +#' Turn a string of the form /partition1=val/partition2=val2 into data.frame +#' @param path_strs The paths in string form to break into partition format +split_string_into_df <- function(path_strs) { + paths = dirname(path_strs) %>% unique + list_of_partitions = stringr::str_split(paths, "/") + + tmp = mapply(function(partition, path) { + part_val = stringr::str_split(partition, "=") + tmp = lapply(part_val, function(part_val, lvl) { + tmp = data.frame(partition = part_val[2]) + names(tmp) = part_val[1] + tmp + }) %>% + do.call(cbind, .) 
+ + tmp$.disk.frame.sub.path = path + + tmp + }, list_of_partitions, paths, SIMPLIFY = FALSE) %>% data.table::rbindlist() + + tmp +} + +if(F) { + df = disk.frame("C:/temp/ok.df") %>% + filter(partition == 1) +} + +#' Get the partitioning structure of a folder +#' @param df a disk.frame whose paths will be used to determine if it's +#' folder-partitioned disk.frame +#' @importFrom utils type.convert +get_partition_paths <- function(df) { + stopifnot("disk.frame" %in% class(df)) + path = tools::file_path_as_absolute(attr(df, "path")) + + allowed_paths = path + + meta_path = file.path(path, ".metadata") + + is_partitioned = FALSE + df_of_partitions = NULL + + # if it's a partitioned structure allow more search paths than root + if (length(setdiff(list.dirs(path, recursive = FALSE), meta_path)) >= 1) { + lf = list.files(path, full.names = FALSE, pattern="fst", recursive=TRUE) + + # create a data.frame of the paths so it can be filtered + df_of_partitions = split_string_into_df(lf) + # infer the types + df_of_partitions = utils::type.convert(df_of_partitions, as.is=TRUE) + + # if there is a filter operation, filter the above to figure out + allowed_paths = df_of_partitions$.disk.frame.sub.path + + # filter for some paths if necessary + partition_filter_info = attr(df, "partition_filter") + if (!is.null(partition_filter_info)) { + # apply filter + df_of_partitions = eval(partition_filter_info$expr, list(dataframe=df_of_partitions)) + + allowed_paths = df_of_partitions$.disk.frame.sub.path + } + is_partitioned = TRUE + } + + # now go through the allowed paths + list(paths=file.path(attr(df, "path"), allowed_paths), is_partitioned=is_partitioned, df = df_of_partitions) + + # TODO check all files sit within the same structure +} diff --git a/R/is_disk.frame.r b/R/is_disk.frame.r index 542bd38b..48f538da 100644 --- a/R/is_disk.frame.r +++ b/R/is_disk.frame.r @@ -10,24 +10,30 @@ #' # clean up cars.df #' delete(cars.df) is_disk.frame <- function(df) { - ##browser 
if("disk.frame" %in% class(df)) { df = attr(df, "path", exact=TRUE) } else if(!"character" %in% class(df)) { # character then check the path return(FALSE) } - files <- fs::dir_ls(df, type="file", all = TRUE) + files <- fs::dir_ls(df, type="file", all = TRUE) # if all files are fst if(length(files)>0) { if(any(purrr::map_lgl(files, ~length(grep(glob2rx("*.fst"), .x)) == 0))) { - # some of the fiels do not have a .fst extension + # some of the files do not have a .fst extension return(FALSE) } } dirs = fs::dir_ls(df, type="directory", all = TRUE) if(length(dirs) > 1) { + # are the directories of this form name=val + split_dirs = stringr::str_split(basename(setdiff(dirs, file.path(df, ".metadata"))), "=") + if (all(sapply(split_dirs, length) == 2)) { + # all folder are + # TODO check the folder recursively + return(TRUE) + } return(FALSE) } else if(length(dirs) == 1) { if(substr(dirs, nchar(dirs)-8,nchar(dirs)) != ".metadata") { diff --git a/R/names.r b/R/names.r index 05d047da..9f4393a3 100644 --- a/R/names.r +++ b/R/names.r @@ -25,17 +25,17 @@ names.disk.frame <- function(x, ...) { #' @export colnames.disk.frame <- function(x, ...) { res = attr(x, "path", exact=TRUE) %>% - list.files(full.names = TRUE) + list.files(full.names = TRUE, recursive=TRUE, pattern = "fst") - if(is.null(attr(x, "recordings"))) { + # if(is.null(attr(x, "recordings"))) { if(length(res) == 0) { return(vector("character")) } return(fst::metadata_fst(res[1])$columnNames) - } else { - tiny_example_data.frame = get_chunk(x, 1, from=1, to=1) - return(colnames(tiny_example_data.frame)) - } + # } else { + # tiny_example_data.frame = get_chunk(x, 1, from=1, to=1) + # return(colnames(tiny_example_data.frame)) + # } } diff --git a/R/nchunks.r b/R/nchunks.r index d40eb6e6..17c89888 100644 --- a/R/nchunks.r +++ b/R/nchunks.r @@ -36,7 +36,7 @@ nchunks.disk.frame <- function(df, skip.ready.check = FALSE, ...) 
{ #if(!skip.ready.check) stopifnot(is_ready(df)) fpath <- attr(df,"path", exact=TRUE) if(is.dir.disk.frame(df)) { - return(length(fs::dir_ls(fpath, type="file"))) + return(length(list.files(fpath, pattern="fst", recursive = TRUE))) } else { return(1) } diff --git a/R/ncol-nrow.r b/R/ncol-nrow.r index 1addc2dc..3f9e17c2 100644 --- a/R/ncol-nrow.r +++ b/R/ncol-nrow.r @@ -29,7 +29,7 @@ nrow.disk.frame <- function(df, ...) { stopifnot(is_ready(df)) path1 <- attr(df,"path", exact=TRUE) if(dir.exists(path1)) { - path2 <- list.files(path1,full.names = TRUE) + path2 <- list.files(path1, full.names = TRUE, recursive=TRUE, pattern = "fst") if(length(path2) == 0) { return(0) } diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R index 27f62f2f..39d9b350 100644 --- a/R/one-stage-verbs.R +++ b/R/one-stage-verbs.R @@ -209,19 +209,31 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) { #' @rdname group_by #' @export summarise.grouped_disk.frame <- function(.data, ...) { + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) 
# convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- rlang::quo_squash(dotdotdot[[i]]) } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } @@ -246,22 +258,28 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame #' @rdname group_by # learning from https://docs.dask.org/en/latest/dataframe-groupby.html group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) { - class(.data) <- c("grouped_disk.frame", "disk.frame") - # using rlang is a neccesary evil here as I need to deal with !!! that is supported by group_by etc + # using rlang is a necessary evil here as I need to deal with !!! that is supported by group_by etc group_by_cols = rlang::enexprs(...) 
# convert any quosure to labels for (i in seq_along(group_by_cols)) { - if("quosure" %in% class(group_by_cols[[i]])) { - group_by_cols[[i]] <- rlang::sym(rlang::as_label(group_by_cols[[i]])) - } + group_by_cols[[i]] <- group_by_cols[[i]] %>% + rlang::quo_squash() } - attr(.data, "group_by_cols") = group_by_cols + # detect any global variables + args_str = sapply(group_by_cols, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "group_by_globals_and_pkgs") = find_globals_recursively(parse(text=sprintf("list(%s)", args_str))[[1]], parent.frame()) + .data } @@ -270,19 +288,33 @@ group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.fra #' @importFrom dplyr summarize #' @rdname group_by summarize.disk.frame <- function(.data, ...) { + + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) # convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } diff --git a/R/overwrite_check.r b/R/overwrite_check.r index f0651315..eabcf92b 100644 --- a/R/overwrite_check.r +++ b/R/overwrite_check.r @@ -15,7 +15,6 @@ #' # clean up #' fs::dir_delete(tf) overwrite_check <- function(outdir, overwrite) { - ##browser if (is.null(outdir)) { warning("outdir is NULL; no overwrite check is performed") return(NULL) @@ -30,7 +29,7 @@ overwrite_check 
<- function(outdir, overwrite) { fs::dir_delete(outdir) }, error = function(e) { message(e) - stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reason and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems")) + stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reasons and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems")) }) } diff --git a/R/partition-filter.r b/R/partition-filter.r new file mode 100644 index 00000000..6d754f2e --- /dev/null +++ b/R/partition-filter.r @@ -0,0 +1,13 @@ +#' Filter the dataset based on folder partitions +#' @param x a disk.frame +#' @param ... filtering conditions for filtering the disk.frame at (folder) partition level +#' @importFrom dplyr filter +#' @export +partition_filter <- function(x, ...) { + expr = bquote(dplyr::filter(dataframe, .(substitute(...)))) + globals = find_globals_recursively(expr, parent.frame()) + + attr(x, "partition_filter") = list(expr=expr, globals=globals) + + return(x) +} diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 2b987600..bb59aec4 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -1,3 +1,4 @@ + #' Recommend number of chunks based on input size #' @description Computes the recommended number of chunks to break a data.frame #' into. 
It can accept filesizes in bytes (as integer) or a data.frame @@ -22,7 +23,6 @@ #' # recommend nchunks based on file size ONLY CSV is implemented at the moment #' recommend_nchunks(1024^3) recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthreads(), conservatism = 8, ram_size = df_ram_size()) { - dfsize = 0 if ("data.frame" %in% class(df)) { # the df's size in gigabytes @@ -55,7 +55,6 @@ recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthr #' # returns the RAM size in gigabyte (GB) #' df_ram_size() df_ram_size <- function() { - #browser() tryCatch({ ram_size = NULL # the amount of memory available in gigabytes @@ -119,10 +118,10 @@ df_ram_size <- function() { ram_size = benchmarkme::get_ram()/1024^3 if(is.na(ram_size)) { - warning("RAM size can't be determined. Assume you have 16GB of RAM.") - warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") - warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) - return(16) + warning("RAM size can't be determined. 
Assume you have 16GB of RAM.") + warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") + warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) + return(16) } else { ram_size = max(ram_size, 1, na.rm = TRUE) return(ram_size) diff --git a/R/util.r b/R/util.r index 8b760f0f..03987c4b 100644 --- a/R/util.r +++ b/R/util.r @@ -45,3 +45,36 @@ purrr_as_mapper <- function(.f) { } return(.f) } + +#' Find globals in an expression by searching through the chain +#' @param code An expression to search for globals +#' @param envir The environment from which to begin the search +find_globals_recursively <- function(code, envir) { + globals_and_pkgs = future::getGlobalsAndPackages(code, envir) + + global_vars = globals_and_pkgs$globals + + done = identical(envir, emptyenv()) || identical(envir, globalenv()) + + if(done) return(globals_and_pkgs) + + env = parent.env(envir) + + done = identical(env, emptyenv()) || identical(env, globalenv()) + + # keep adding global variables by moving up the environment chain + while(!done) { + tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) + new_global_vars = tmp_globals_and_pkgs$globals + for (name in setdiff(names(new_global_vars), names(global_vars))) { + global_vars[[name]] <- new_global_vars[[name]] + } + + done = identical(env, emptyenv()) || identical(env, globalenv()) + env = parent.env(env) + } + + globals_and_pkgs$globals = global_vars + + return(globals_and_pkgs) +} \ No newline at end of file diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r index cdfd0c36..7d38e0c4 100644 --- a/R/write_disk.frame.r +++ b/R/write_disk.frame.r @@ -7,13 +7,12 @@ #' @param nchunks number of chunks #' @param overwrite overwrite output directory #' @param shardby the columns to shard by +#' @param partitionby the columns to (folder) partition by #' @param compress compression ratio for fst files -#' @param shardby_function splitting of 
chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits for the "sort" shardby function, a dataframe with the split values. -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. #' @param ... passed to cmap.disk.frame #' @export #' @import fst fs +#' @importFrom dplyr group_map #' @importFrom glue glue #' @examples #' cars.df = as.disk.frame(cars) @@ -33,18 +32,67 @@ write_disk.frame <- function( nchunks.disk.frame(diskf), recommend_nchunks(diskf)), overwrite = FALSE, - shardby=NULL, compress = 50, shardby_function="hash", sort_splits=NULL, desc_vars=NULL, ...) { + shardby=NULL, + partitionby=NULL, + compress = 50, ...) { force(nchunks) overwrite_check(outdir, overwrite) - if(is.null(outdir)) { stop("write_disk.frame error: outdir must not be NULL") } if(is_disk.frame(diskf)) { - if(is.null(shardby)) { + if(!is.null(partitionby)) { + + # for each chunk group by the partionby and then write out a partitioned disk.frame for each chunk + list_of_paths = diskf %>% + cimap(~{ + tmp_dir_to_write = tempfile(as.character(.y)) + tmp = .x %>% + group_by(!!!syms(partitionby)) %>% + dplyr::group_map(~{ + # convert group keys to path + tmp_path = lapply(names(.y), function(n) { + sprintf("%s=%s", n, .y[, n]) + }) %>% + do.call(file.path, .) 
+ + final_tmp_path = file.path(tmp_dir_to_write, tmp_path) + as.disk.frame(.x, final_tmp_path, overwrite = FALSE) + }) + return(tmp_dir_to_write) + }, lazy=FALSE) + + # for each of the chunks, do a soft row-append + partitioned_files = lapply(list_of_paths, function(path) { + # each path is a partitioned disk.frame + files = list.files(path, full.names = TRUE, recursive=TRUE) + tmp = data.frame(partition_path = files %>% + dirname %>% + sapply(tools::file_path_as_absolute) %>% + stringr::str_sub(nchar(path)+2)) + tmp = tmp %>% mutate(path=path, files=files) + + tmp + }) %>% rbindlist + + partitioned_files %>% + group_by(partition_path) %>% + dplyr::group_map(function(df, grp) { + mapply(function(file, i) { + outfile = file.path(outdir, grp$partition_path, paste0(i, ".fst")) + if(!dir.exists(file.path(outdir, grp$partition_path))) { + fs::dir_create(file.path(outdir, grp$partition_path)) + } + fs::file_move(file, outfile) + }, df$files, seq_along(df$files)) + }) + + return(disk.frame(outdir)) + + } else if(is.null(shardby)) { path = attr(diskf, "path") files_shortname <- list.files(path) cids = get_chunk_ids(diskf, full.names = T, strip_extension = F) @@ -69,8 +117,6 @@ write_disk.frame <- function( overwrite = TRUE, shardby = shardby, compress = compress, - shardby_function=shardby_function, - sort_splits=sort_splits, desc_vars=desc_vars, ... ) } diff --git a/R/zzz.r b/R/zzz.r index 7c82cd92..02b5febf 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -5,15 +5,35 @@ #' @importFrom future nbrOfWorkers #' @importFrom crayon red blue green .onAttach <- function(libname, pkgname) { - #setup_disk.frame() - packageStartupMessage( - crayon::red( + crayon::blue( +"\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated. You are recommended to switch over to using the {arrow} package for handling larger-than-RAM data. 
You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n +``` +disk.frame::disk.frame_to_parquet(path.to.your.disk.frame, parquet_path) +``` + +Once done you can use {arrow}'s dataset features to manipulate the larger-than-RAM data using dplyr verbs. E.g. + +``` +dataset = arrow::open_dataset(parquet_path) + +dataset |> + mutate(...) |> + group_by(...) |> + summarize(...) |> + collect(...) +``` +" + ), + crayon::red( glue::glue( - "\n\n## Message from disk.frame: + "\n\n## Message from disk.frame: We have {future::nbrOfWorkers()} workers to use with disk.frame. -To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.")), - crayon::green("\n\n +To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults." + ) + ), + crayon::green( + "\n\n It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:\n\n ```r # this will set up disk.frame with multiple workers @@ -21,7 +41,8 @@ setup_disk.frame() # this will allow unlimited amount of data to be passed from worker to worker options(future.globals.maxSize = Inf) ``` -\n\n")) +\n\n" + )) } globalVariables(c( @@ -58,7 +79,13 @@ globalVariables(c( "pathB", "w", "xid", - "yid")) + "yid", + "paths", + ".disk.frame.sub.path", + "fullpath", + ".check", + "partition_path" + )) #' @useDynLib disk.frame #' @importFrom Rcpp evalCpp diff --git a/cran-comments.md b/cran-comments.md index 05b563d8..245c6d74 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,14 +1,12 @@ -## Resubmission 1 for v0.6.0 -* removed the vignette builder which should remove the 1 NOTE - -## Submission for v0.6.0 -* Updated the API for many functions leading to semver update of functions +## Submission for v0.7 +* Implemented partition by folder +* Updated R version to v4.0 ## Test environments * local Windows 11 Pro install, R 4.1.2 * local
Windows 11 Pro install, R devel (as of 2022-01-31) +* local Windows 11 Pro install, R devel (as of 2022-02-01) * local Linux/Ubuntu install, R 4.1.2 -* local Linux/Ubuntu install, R devel (as of 2022-01-31) +* local Linux/Ubuntu install, R devel (as of 2022-02-01) ## R CMD check results There were no ERRORs nor WARNINGs nor NOTE when run locally. diff --git a/docs/404.html b/docs/404.html index 9ccac74e..49d167f0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 5757cc4b..d3b2e167 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index af353427..35e9a2b4 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index 5526ed10..adfe01a1 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 @@ -555,26 +555,26 @@

Sampling
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1  10     1614           1605         9     1926           1944
-#> 2: 2013     5  13     1136           1136         0     1225           1234
-#> 3: 2013     5  16     1428           1420         8     1535           1538
-#> 4: 2013    12  29     1536           1520        16     1817           1750
-#> 5: 2013    12  21      849            820        29     1315           1345
-#> 6: 2013    12  25     1624           1559        25     1846           1825
+#> 1: 2013     5  24     1452           1455        -3     1840           1820
+#> 2: 2013     1  10      939            905        34     1259           1235
+#> 3: 2013     2   4     1911           1915        -4     2117           2111
+#> 4: 2013     5  12     1634           1545        49     1940           1916
+#> 5: 2013     1  16     2315           2250        25       40              7
+#> 6: 2013    12  28      913            900        13     1215           1227
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:       -18      DL   1508  N952DL    JFK  RSW      163     1074   16      5
-#> 2:        -9      EV   3830  N13955    EWR  PVD       35      160   11     36
-#> 3:        -3      EV   4284  N11536    EWR  ROC       45      246   14     20
-#> 4:        27      MQ   3553  N520MQ    LGA  XNA      181     1147   15     20
-#> 5:       -30      DL    454  N682DA    JFK  STT      188     1623    8     20
-#> 6:        21      EV   5567  N870AS    LGA  CAE       99      617   15     59
+#> 1:        20      AA   1769  N372AA    JFK  MIA      165     1089   14     55
+#> 2:        24      VX    407  N622VA    JFK  LAX      349     2475    9      5
+#> 3:         6      9E   3525  N903XJ    JFK  ORD      127      740   19     15
+#> 4:        24      DL   1773  N3739P    JFK  SLC      262     1990   15     45
+#> 5:        33      B6     30  N187JB    JFK  ROC       59      264   22     50
+#> 6:       -12      DL    422  N713TW    JFK  LAX      327     2475    9      0
 #>              time_hour
-#> 1: 2013-01-10 21:00:00
-#> 2: 2013-05-13 15:00:00
-#> 3: 2013-05-16 18:00:00
-#> 4: 2013-12-29 20:00:00
-#> 5: 2013-12-21 13:00:00
-#> 6: 2013-12-25 20:00:00
+#> 1: 2013-05-24 18:00:00 +#> 2: 2013-01-10 14:00:00 +#> 3: 2013-02-05 00:00:00 +#> 4: 2013-05-12 19:00:00 +#> 5: 2013-01-17 03:00:00 +#> 6: 2013-12-28 14:00:00

Writing Data diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index 5797c66c..95c0d134 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7

diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index d9e14783..fb122f5d 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index 7027aebf..3c67ccc2 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index fe2bcd28..e8652bab 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 @@ -189,14 +189,14 @@

disk.framesystem.time(setup_disk.frame()) # ~4s #> The number of workers available for disk.frame is 6 #> user system elapsed -#> 0.20 0.03 2.32 +#> 0.15 0.04 2.41

We note that there is some time needed for disk.frame to start up all the workers. Next we try to convert the largest CSV file to disk.frame format. The file to be converted is about 2.2GB in size

 time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
 
 time_to_convert_disk.frame
 #> elapsed 
-#>    28.3
+#> 29.67

Now that we have converted it, we want to do a count by the first column. To achieve this we use a “two-stage” aggregation strategy. Note that we use keep="V1" to bring only the column V1 into RAM. This avoids the reading of other unnecessary columns and should speed-up the analysis significantly

 time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
@@ -204,7 +204,7 @@ 
disk.frametime_to_agg_disk.frame #> user system elapsed -#> 0.13 0.03 8.48
+#> 0.08 0.01 8.82

We can inspect the result as well.

 summ
@@ -228,7 +228,7 @@ 
disk.framesummarise(N = n()) %>% collect) #> user system elapsed -#> 1.53 0.17 10.25
+#> 1.64 0.13 4.43

However, the dplyr syntax tends to be slightly slower than using data.table syntax. This may be improved as much of the overhead is due to inefficient use of NSE.

diff --git a/docs/articles/07-glm.html b/docs/articles/07-glm.html index 2b340f3c..01e5f05e 100644 --- a/docs/articles/07-glm.html +++ b/docs/articles/07-glm.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7
diff --git a/docs/articles/08-more-epic.html b/docs/articles/08-more-epic.html index f4170f94..a15ed651 100644 --- a/docs/articles/08-more-epic.html +++ b/docs/articles/08-more-epic.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/09-convenience-features.html b/docs/articles/09-convenience-features.html index ef7bf2ed..9e2bcfc3 100644 --- a/docs/articles/09-convenience-features.html +++ b/docs/articles/09-convenience-features.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/10-group-by.html b/docs/articles/10-group-by.html index f18f3b51..eaa55ced 100644 --- a/docs/articles/10-group-by.html +++ b/docs/articles/10-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/11-custom-group-by.html b/docs/articles/11-custom-group-by.html index 695f467f..8ca8d702 100644 --- a/docs/articles/11-custom-group-by.html +++ b/docs/articles/11-custom-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/88-trouble-shooting.html b/docs/articles/88-trouble-shooting.html index e1ba2f47..cb894f3d 100644 --- a/docs/articles/88-trouble-shooting.html +++ b/docs/articles/88-trouble-shooting.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/index.html b/docs/articles/index.html index 3e4cba20..ab7832cd 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/authors.html b/docs/authors.html index 97226196..7c8e7588 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.6.0, https://diskframe.com. +R package version 0.7, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.6.0},
+  note = {R package version 0.7},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index 7ea2fc4e..f86e68c5 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.6.0 + 0.7 @@ -228,6 +228,7 @@

Quick-start# this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two) setup_disk.frame() +#> The number of workers available for disk.frame is 6 # this allows large datasets to be transferred between sessions options(future.globals.maxSize = Inf) @@ -247,12 +248,12 @@

dplyr verbsfilter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> 1: 2013 1 1 517 515 2 830 819 -#> 2: 2013 1 1 533 529 4 850 830 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 -#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay +#> 1: 2013 1 1 517 515 2 830 819 11 +#> 2: 2013 1 1 533 529 4 850 830 20 +#> carrier flight tailnum origin dest air_time distance hour minute +#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2: UA 1714 N24211 LGA IAH 227 1416 5 29 #> time_hour origin_dest #> 1: 2013-01-01 05:00:00 EWRIAH #> 2: 2013-01-01 05:00:00 LGAIAH @@ -380,6 +381,7 @@

Example: data.table syntax.(qtr = ifelse(month <= 3, "Q1", "Q2")) ] ) +#> data.table syntax for disk.frame may be moved to a separate package in the future grp_by_stage1 #> qtr sum_dist @@ -405,24 +407,24 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df"
+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQJt0m9\\file393027da33b.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1   1      517            515         2      830            819
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        11      UA   1545  N14228    EWR  IAH      227     1400    5     15
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     1   1      517            515         2      830            819        11
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      UA   1545  N14228    EWR  IAH      227     1400    5     15
 #>              time_hour
 #> 1: 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     9  30       NA            840        NA       NA           1020
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        NA      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     9  30       NA            840        NA       NA           1020        NA
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
 #>              time_hour
 #> 1: 2013-09-30 08:00:00
diff --git a/docs/news/index.html b/docs/news/index.html
index 48d1c26d..551c9495 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.6.0
+        0.7
       
     
@@ -90,6 +90,17 @@

Changelog

Source: NEWS.md +
+ +
+
+ +
@@ -135,12 +135,12 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 @@ -154,12 +154,12 @@

Examples

# you wish to add multiple chunk in parralel add_chunk(df2, data.frame(chunk=1), 1) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 1 #> nrow (at source): 1 #> ncol (at source): 1 add_chunk(df2, data.frame(chunk=2), 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 2 #> nrow (at source): 2 #> ncol (at source): 1 diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index aea1586c..e93f1492 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 1e5146c2..02416086 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index fa22a8ea..655a13b1 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html index 3a336451..0f028244 100644 --- a/docs/reference/bind_rows.disk.frame.html +++ b/docs/reference/bind_rows.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index f7bbff69..52940b9f 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -34,7 +34,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index 1a518792..5909c5a5 100644 --- a/docs/reference/cmap.html +++ 
b/docs/reference/cmap.html @@ -21,7 +21,7 @@ disk.frame - 0.6.0 + 0.7 @@ -257,7 +257,7 @@

Examples

# return the first row of each chunk eagerly as list cmap(cars.df, ~.x[1,], lazy = FALSE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file4718220342f7.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c385263150e.df" #> nchunks: 6 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index 336c10b5..5a9bf032 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/collect.html b/docs/reference/collect.html index 0d063a49..358f66f3 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 210eada5..0956e34d 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 5e9a9f2c..bf8e3c87 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 1269b5dd..3a425805 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index f91d899a..7e59e587 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 9cf26191..5ac2c8de 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/df_ram_size.html 
b/docs/reference/df_ram_size.html index c986617f..898899a8 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index c9cc7c73..b6b3dd12 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index fd1f9556..e4c01f98 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -111,7 +111,7 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 143e9c7e..ff377e41 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.0
+        0.7
       
     
diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index d88bc80a..78e280de 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/find_globals_recursively.html b/docs/reference/find_globals_recursively.html new file mode 100644 index 00000000..b19e8a83 --- /dev/null +++ b/docs/reference/find_globals_recursively.html @@ -0,0 +1,133 @@ + +Find globals in an expression by searching through the chain — find_globals_recursively • disk.frame + + +
+
+ + + +
+
+ + +
+

Find globals in an expression by searching through the chain

+
+ +
+
find_globals_recursively(code, envir)
+
+ +
+

Arguments

+
code
+

An expression to search for globals

+
envir
+

The environment from which to begin the search

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index 6fbc7457..383c9d0a 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 8855f4c1..f890a9a7 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index fffdd1c7..a7e53e5b 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -99,7 +99,7 @@

Obtain one chunk by chunk id

get_chunk(...)
 
 # S3 method for disk.frame
-get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
+get_chunk(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL)
@@ -114,6 +114,8 @@

Arguments

the columns to keep

full.names

whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

+
partitioned_info
+

for internal use only. It's a data.frame used to help with filtering by partitions

diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index fc6d82da..ac2822dc 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7
@@ -117,20 +117,22 @@

Examples

# return the integer-string chunk IDs get_chunk_ids(cars.df) -#> [1] "1" "2" "3" "4" "5" "6" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1" "2" "3" "4" "5" "6" # return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) -#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" # clean up cars.df delete(cars.df) diff --git a/docs/reference/get_partition_paths.html b/docs/reference/get_partition_paths.html new file mode 100644 index 00000000..7dff33ce --- /dev/null +++ b/docs/reference/get_partition_paths.html @@ -0,0 +1,132 @@ + +Get the partitioning structure of a folder — get_partition_paths • disk.frame + + +
+
+ + + +
+
+ + +
+

Get the partitioning structure of a folder

+
+ +
+
get_partition_paths(df)
+
+ +
+

Arguments

+
df
+

a disk.frame whose paths will be used to determine if it's +folder-partitioned disk.frame

+
+ +
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index 7149e006..2a026cdb 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 89dc5d99..d1c2b154 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 01e04939..50bd1d56 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/index.html b/docs/reference/index.html index 7447de77..264deb04 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -176,6 +176,10 @@

All functions evalparseglue()

Helper function to evalparse some `glue::glue` string

+ +

find_globals_recursively()

+ +

Find globals in an expression by searching through the chain

foverlaps.disk.frame()

@@ -192,6 +196,10 @@

All functions get_chunk_ids()

Get the chunk IDs and files names

+ +

get_partition_paths()

+ +

Get the partitioning structure of a folder

groups(<disk.frame>)

@@ -240,6 +248,10 @@

All functions overwrite_check()

Check if the outdir exists or not

+ +

partition_filter()

+ +

Filter the dataset based on folder partitions

play()

@@ -296,6 +308,10 @@

All functions show_ceremony() ceremony_text() show_boilerplate() insert_ceremony()

Show the code to setup disk.frame

+ +

split_string_into_df()

+ +

Turn a string of the form /partion1=val/partion2=val2 into data.frame

srckeep()

diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index d32d0036..0a09221c 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/join.html b/docs/reference/join.html index 87b1e852..66d1b402 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 63de7ae6..a9e7818c 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index f93a7f53..08f3ff97 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index d4211c0b..b28af7b5 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 9321954a..f9a52a07 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 5b5d239a..b5d845bb 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index ac3c5abf..bed4cc3e 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 
a33eaeb4..67a2b150 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/partition_filter.html b/docs/reference/partition_filter.html new file mode 100644 index 00000000..14d62333 --- /dev/null +++ b/docs/reference/partition_filter.html @@ -0,0 +1,133 @@ + +Filter the dataset based on folder partitions — partition_filter • disk.frame + + +
+
+ + + +
+
+ + +
+

Filter the dataset based on folder partitions

+
+ +
+
partition_filter(x, ...)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
...
+

filtering conditions for filtering the disk.frame at (folder) partition level

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/play.html b/docs/reference/play.html index 286b1855..1fd2f784 100644 --- a/docs/reference/play.html +++ b/docs/reference/play.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index 76b587e8..d39140bf 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index 172cb910..41b02d1a 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html index 4e334c11..5a118b61 100644 --- a/docs/reference/purrr_as_mapper.html +++ b/docs/reference/purrr_as_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index b50ff95f..4ca1ab33 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index b5f2b78c..1e7fc40a 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -126,8 +126,8 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\back_up_tmp_dir471815f02658. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471862cf5821.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\back_up_tmp_dir1c382d2e21d6. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3824aa62c4.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index dd1c227b..f1a9a12e 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 232f00e1..8fbb459d 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -116,7 +116,7 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 @@ -125,7 +125,7 @@

Examples

# removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 @@ -134,7 +134,7 @@

Examples

# removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 056cef09..45c890af 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -127,28 +127,28 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist -#> 1: 7 4 -#> 2: 4 2 -#> 3: 10 18 -#> 4: 10 34 +#> 1: 7 22 +#> 2: 10 18 +#> 3: 9 10 +#> 4: 8 16 #> 5: 11 17 #> 6: 13 34 -#> 7: 13 26 -#> 8: 12 28 -#> 9: 15 26 -#> 10: 13 46 -#> 11: 16 32 -#> 12: 15 20 -#> 13: 18 42 -#> 14: 17 50 -#> 15: 19 36 -#> 16: 18 76 -#> 17: 20 56 -#> 18: 20 64 -#> 19: 23 54 -#> 20: 20 52 -#> 21: 24 93 -#> 22: 25 85 +#> 7: 13 34 +#> 8: 12 14 +#> 9: 14 80 +#> 10: 15 54 +#> 11: 14 36 +#> 12: 16 32 +#> 13: 17 50 +#> 14: 18 42 +#> 15: 18 56 +#> 16: 18 84 +#> 17: 20 52 +#> 18: 20 32 +#> 19: 20 64 +#> 20: 19 46 +#> 21: 24 92 +#> 22: 24 70 #> speed dist # clean up cars.df diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 6d502bac..a2994988 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 7b598974..f419f59c 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index 2fce89e2..f4553236 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index dbc9bcce..7b013cd6 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index d1d69c51..8df3ffd8 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/split_string_into_df.html b/docs/reference/split_string_into_df.html new file mode 100644 index 00000000..9a65c61c --- /dev/null +++ 
b/docs/reference/split_string_into_df.html @@ -0,0 +1,131 @@ + +Turn a string of the form /partion1=val/partion2=val2 into data.frame — split_string_into_df • disk.frame + + +
+
+ + + +
+
+ + +
+

Turn a string of the form /partition1=val/partition2=val2 into data.frame

+
+ +
+
split_string_into_df(path_strs)
+
+ +
+

Arguments

+
path_strs
+

The paths in string form to break into partition format

+
+ +
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 8bfaee95..5b927dbd 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 3c758a26..9023b0d1 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index 1df5aeaf..6a51358a 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 6eda846a..b57289cc 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 @@ -105,10 +105,8 @@

Write disk.frame to disk

recommend_nchunks(diskf)), overwrite = FALSE, shardby = NULL, + partitionby = NULL, compress = 50, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, ... ) @@ -127,14 +125,10 @@

Arguments

overwrite output directory

shardby

the columns to shard by

+
partitionby
+

the columns to (folder) partition by

compress

compression ratio for fst files

-
shardby_function
-

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

-
sort_splits
-

for the "sort" shardby function, a dataframe with the split values.

-
desc_vars
-

for the "sort" shardby function, the variables to sort descending.

...

passed to cmap.disk.frame

diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index fe4335d3..9a170ce9 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 @@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) -#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471855d8254c.zip' cannot be opened +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3872483ca5.zip' cannot be opened # there is only one csv file so it return a list of one disk.frame zip.df[[1]] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 5df92161..b50973ee 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -171,6 +171,9 @@ /reference/evalparseglue.html + + /reference/find_globals_recursively.html + /reference/foverlaps.disk.frame.html @@ -183,6 +186,9 @@ /reference/get_chunk_ids.html + + /reference/get_partition_paths.html + /reference/groups.disk.frame.html @@ -240,6 +246,9 @@ /reference/overwrite_check.html + + /reference/partition_filter.html + /reference/play.html @@ -282,6 +291,9 @@ /reference/show_ceremony.html + + /reference/split_string_into_df.html + /reference/srckeep.html diff --git a/man/disk.frame_to_parquet.Rd b/man/disk.frame_to_parquet.Rd new file mode 100644 index 00000000..559a9a7f --- /dev/null +++ b/man/disk.frame_to_parquet.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/disk.frame-to-parquet.r +\name{disk.frame_to_parquet} +\alias{disk.frame_to_parquet} +\title{A function to convert a disk.frame to parquet format} +\usage{ +disk.frame_to_parquet(df, outdir) +} +\arguments{ +\item{df}{a disk.frame or a path to a disk.frame} + +\item{outdir}{the path to save the parquet files} +} +\description{ +A function to convert a disk.frame to parquet format +} diff --git a/man/find_globals_recursively.Rd b/man/find_globals_recursively.Rd new file mode 100644 index 00000000..2d7bc7fb --- /dev/null +++ b/man/find_globals_recursively.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
documentation in R/util.r +\name{find_globals_recursively} +\alias{find_globals_recursively} +\title{Find globals in an expression by searching through the chain} +\usage{ +find_globals_recursively(code, envir) +} +\arguments{ +\item{code}{An expression to search for globals} + +\item{envir}{The environment from which to begin the search} +} +\description{ +Find globals in an expression by searching through the chain +} diff --git a/man/get_chunk.Rd b/man/get_chunk.Rd index a4b4b796..86463463 100644 --- a/man/get_chunk.Rd +++ b/man/get_chunk.Rd @@ -7,7 +7,7 @@ \usage{ get_chunk(...) -\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ...) +\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL) } \arguments{ \item{...}{passed to fst::read_fst or whichever read function is used in the backend} @@ -19,6 +19,8 @@ get_chunk(...) \item{keep}{the columns to keep} \item{full.names}{whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric} + +\item{partitioned_info}{for internal use only. 
It's a data.frame used to help with filtering by partitions} } \description{ Obtain one chunk by chunk id diff --git a/man/get_partition_paths.Rd b/man/get_partition_paths.Rd new file mode 100644 index 00000000..33bb5d39 --- /dev/null +++ b/man/get_partition_paths.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_partition.r +\name{get_partition_paths} +\alias{get_partition_paths} +\title{Get the partitioning structure of a folder} +\usage{ +get_partition_paths(df) +} +\arguments{ +\item{df}{a disk.frame whose paths will be used to determine if it's +folder-partitioned disk.frame} +} +\description{ +Get the partitioning structure of a folder +} diff --git a/man/partition_filter.Rd b/man/partition_filter.Rd new file mode 100644 index 00000000..14a93a4e --- /dev/null +++ b/man/partition_filter.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/partition-filter.r +\name{partition_filter} +\alias{partition_filter} +\title{Filter the dataset based on folder partitions} +\usage{ +partition_filter(x, ...) 
+} +\arguments{ +\item{x}{a disk.frame} + +\item{...}{filtering conditions for filtering the disk.frame at (folder) partition level} +} +\description{ +Filter the dataset based on folder partitions +} diff --git a/man/split_string_into_df.Rd b/man/split_string_into_df.Rd new file mode 100644 index 00000000..c18f63c5 --- /dev/null +++ b/man/split_string_into_df.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_partition.r +\name{split_string_into_df} +\alias{split_string_into_df} +\title{Turn a string of the form /partion1=val/partion2=val2 into data.frame} +\usage{ +split_string_into_df(path_strs) +} +\arguments{ +\item{path_strs}{The paths in string form to break into partition format} +} +\description{ +Turn a string of the form /partion1=val/partion2=val2 into data.frame +} diff --git a/man/write_disk.frame.Rd b/man/write_disk.frame.Rd index 60c2bf09..c152c851 100644 --- a/man/write_disk.frame.Rd +++ b/man/write_disk.frame.Rd @@ -12,10 +12,8 @@ write_disk.frame( recommend_nchunks(diskf)), overwrite = FALSE, shardby = NULL, + partitionby = NULL, compress = 50, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, ... ) @@ -32,13 +30,9 @@ output_disk.frame(...) 
\item{shardby}{the columns to shard by} -\item{compress}{compression ratio for fst files} - -\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks} +\item{partitionby}{the columns to (folder) partition by} -\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.} - -\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.} +\item{compress}{compression ratio for fst files} \item{...}{passed to cmap.disk.frame} } diff --git a/manuscript/Book.txt b/manuscript/Book.txt deleted file mode 100644 index 08609b2e..00000000 --- a/manuscript/Book.txt +++ /dev/null @@ -1,2 +0,0 @@ -taming-larger-than-ram-data-course.txt - diff --git a/manuscript/taming-larger-than-ram-data-course.txt b/manuscript/taming-larger-than-ram-data-course.txt deleted file mode 100644 index 864097c7..00000000 --- a/manuscript/taming-larger-than-ram-data-course.txt +++ /dev/null @@ -1,2 +0,0 @@ -# Introduction -This course will teach you all the ins and outs of {disk.frame}. The instructor is the author of {disk.frame}. Enjoy! diff --git a/presentation/.gitignore b/presentation/.gitignore deleted file mode 100644 index 0586a46d..00000000 --- a/presentation/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.pptx -*.pdf diff --git a/presentation/rstudio conf 2021/1min video script b/presentation/rstudio conf 2021/1min video script deleted file mode 100644 index 53337167..00000000 --- a/presentation/rstudio conf 2021/1min video script +++ /dev/null @@ -1,11 +0,0 @@ -Hi my name is ZedJ and I am a Data Scientist local to Melbourne. I am a keen contributor to open source data science projects, one of which I want to talk about at rstudio:conf 2021. That project is {disk.frame} - a larger-than-RAM data manipulation package. - -R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. 
- -{disk.frame} solves this issue by providing a 100%-R framework to manipulate data on disk. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. - -Also, {disk.frame} uses {dplyr} verbs to manipulate data so useRs will find it very easy to pick up. - -Finally, because {disk.frame} is 100%-R, you can use any R package with it at no extra cost unlike Spark. - -The talk I propose will introduce {disk.frame} to users with the needs to manipulate large amounts of data with minimal setup. They will find {disk.frame} very familiar, as {disk.frame} uses {dplyr} verbs directly;. Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large datasets. They will find lots of benefits in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. Because {disk.frame} can run R functions natively, they will find that {disk.frame} allows them to many R packages directly with {disk.frame}. diff --git a/presentation/rstudio conf 2021/Abstract proposal.md b/presentation/rstudio conf 2021/Abstract proposal.md deleted file mode 100644 index e7406d3f..00000000 --- a/presentation/rstudio conf 2021/Abstract proposal.md +++ /dev/null @@ -1,5 +0,0 @@ -Learn how to handle 100GBs of data with ease using {disk.frame} - the larger-than-RAM-data manipulation package. - -R loads data in its entirety into RAM. However, RAM is a precious resource and often do run out. That's why most R user would have run into the "cannot allocate vector of size xxB." error at some point. - -However, the need to handle larger-than-RAM data doesn't go away just because RAM isn't large enough. So many useRs turn to big data tools like Spark for the task. In this talk, I will make the case that {disk.frame} is sufficient and often preferable for manipulating larger-than-RAM data that fit on disk. I will show how you can apply familiar {dplyr}-verbs to manipulate larger-than-RAM data with {disk.frame}. 
\ No newline at end of file diff --git a/presentation/twin cities/.gitignore b/presentation/twin cities/.gitignore deleted file mode 100644 index 26416673..00000000 --- a/presentation/twin cities/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.mp4 diff --git a/presentation/user 2020/user-2020-tut-proposal.Rmd b/presentation/user 2020/user-2020-tut-proposal.Rmd deleted file mode 100644 index 83452169..00000000 --- a/presentation/user 2020/user-2020-tut-proposal.Rmd +++ /dev/null @@ -1,103 +0,0 @@ ---- -title: "Easy Larger-than-RAM data manipulation with disk.frame - useR! 2020 tutorial proposal" -author: "ZJ" -date: "11/25/2019" -output: pdf_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Title - -Handling 100GBs of data with ease - Larger-than-RAM data manipulation with disk.frame - -## Audience - -The tutorial aims to introduce {disk.frame} to users with the needs to manipulate large amounts of data. The audience will benefit the most from this tutorial if they are already familiar with popular R data frameworks such as dplyr and {data.table} because {disk.frame} uses dplyr and data.table primitives like `select`, and `dt[i, j, by]`. - -Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large dataset. They will find tremendous benefit in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. - -## Instructor background - -I have more than ten years of experience using R and enterprise data manipulation tools like SAS and SQL. I have experience teaching Data Science courses in R in a group setting. I am the author of the {disk.frame} package, hence I have intimate knowledge of the inner workings of {disk.frame} and is, therefore, well-positioned to teach {disk.frame} for larger-than-RAM data manipulation. - - -The website I created for {disk.frame} is -* https://diskframe.com and - -I have given a useR! 
2019 talk on {disk.frame}, see -* https://www.youtube.com/watch?v=3XMTyi_H4q4&t=1s - -My LinkedIn -* https://www.linkedin.com/in/daizj/ - -Here is an R course I wrote to Data Science in general -* https://github.com/xiaodaigh/intro_r_data_science - -## Domain - -Big Data manipulation - efficiently wrangle data that doesn't fit into RAM - - -## Points of appeal (75 words) -Every R user would have run into the "cannot allocate vector of size xxxB." error at some point. For most applications, R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. - -{disk.frame} solves this issue by providing a framework to manipulate data on disk and minimize RAM usage. By using {disk.frame}, the user can perform analysis on much larger data than is normally possible with R. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. - - -## Learning objectives: 100 – 200 words -In this tutorial, the user will learn how to use {disk.frame} effectively to manipulate datasets up to 100GBs in size on a single computer. - -After the tutorial, the user should be able to - -* learn what the pros and cons of common "big data" technologies like DBMS, Spark, and {disk.frame} are -* understand how {disk.frame} can help with manipulating large amounts of data -* understand when to use {disk.frame} and when not to -* confidently use {disk.frame} for any data manipulation task and not worry about running out of RAM -* load large datasets into {disk.frame} format -* manipulate large {disk.frame} files -* summarize large {disk.frame} files - -## Computing requirement -The requirement for {disk.frame} is R 3.4 and the ability to install packages via `install.packages`. A laptop is advised but not necessary. - -If attendees wish to follow along, I recommend that they should have RStudio or Jupyter notebook set up. However, that is not necessary as I will running through code that they can obtain from Github. 
- -## Teaching Assistant -I don't think Teaching Assistants are necessary. Installing disk.frame is simply running `install.packages("disk.frame")`. - -However, anyone familiar with RStudio setup and installation of packages is welcome as Teaching Assistant. Any expertise in the installation and setting of Jupyter notebook or Jupyter Lab would be highly appreciated. - -## Lesson Plan -The structure will be very informal, and questions are encouraged at all points along the way. - -* Introduction to disk.frame; how it's structured and why it's fast -* Pros and cons of disk.frame; when to use disk.frame and when not to -* Follow along: -* generate a fake dataset that is reasonably large (so avoiding a large download from internet) -* Convert the dataset to CSV -* Show how to load the data from CSV to {disk.frame} -* Manipulate the {disk.frame} -* Do group-by and summary statistics with {disk.frame} -* show how dplyr and data.table syntax can operate on {disk.frame} -* Explain how to perform group-by efficiently -* Demonstrate how joining works for large {disk.frame}s - -**** break **** - -After the break, we will cover advanced {disk.frame} topics - -* Explain how to extend {disk.frame} with a custom function -* Explain how to tune {disk.frame} for maximum efficiency -* Show some convenience features for {disk.frame} in RStudio if time permits -* Conclusion and Q&A - -## Other Considerations -The user can consult https://diskframe.com. It is regularly updated with relevant {disk.frame} resources like articles and how-to-guides. - -The intention is to show how to manipulate larger-than-RAM data with {disk.frame}. However, we will not download any such data from the internet. Instead, we will use an existing function to generate such data. Hence, it will not pose any constraint on internet access nor speed. However, students wishing to follow along on laptops will need to install {disk.frame} via `install.packages("disk.frame")`. 
- - - diff --git a/tests/testthat/test-data-table.r b/tests/testthat/test-data-table.r index d1629450..fb58f324 100644 --- a/tests/testthat/test-data-table.r +++ b/tests/testthat/test-data-table.r @@ -45,10 +45,12 @@ test_that("data.table global vars", { } # Check function with data.table object - grep_storm_name(storms_dt, "^A") + a = grep_storm_name(storms_dt, "^A") # Check function with diskframe object - grep_storm_name(storms_df, "^A") + b = grep_storm_name(storms_df, "^A") + + expect_equal(a, b) }) teardown({ diff --git a/tests/testthat/test-dplyr-verbs.r b/tests/testthat/test-dplyr-verbs.r index 01314556..95ccd489 100644 --- a/tests/testthat/test-dplyr-verbs.r +++ b/tests/testthat/test-dplyr-verbs.r @@ -7,31 +7,31 @@ setup({ test_that("testing select", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - select(a) %>% + + df = b %>% + select(a) %>% collect - + expect_equal(ncol(df), 1) }) test_that("testing rename", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - rename(a_new_name = a) %>% + + df = b %>% + rename(a_new_name = a) %>% collect - + expect_setequal(colnames(df), c("a_new_name", "b")) }) test_that("testing filter", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - filter(a <= 100, b <= 10) %>% + + df = b %>% + filter(a <= 100, b <= 10) %>% collect - + expect_setequal(nrow(df), 10) }) @@ -44,109 +44,118 @@ test_that("testing filter - global vars", { filter(a <= one_hundred, b <= 10) %>% collect + expect_setequal(nrow(df), 10) + + + # testing_env = new.env() + # + # assign("one_hundred", 100, testing_env) + # assign("b", disk.frame(file.path(tempdir(), "tmp_b_dv.df")), testing_env) + # + # eval(parse(text=code), envir = testing_env, enclos=emptyenv()) }) test_that("testing mutate", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - - df = b %>% - mutate(d = a + b) %>% + + + df = b %>% + mutate(d = a + b) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) - - df = b %>% + + 
df = b %>% mutate(e = rank(desc(a))) %>% collect - + expect_equal(nrow(df), 100) - + # need to test value <- as.disk.frame(tibble(char = LETTERS, num = 1:26)) df2 = value %>% dplyr::mutate(b = case_when( char %in% c("A", "B", "C") ~ "1", - TRUE ~ char)) %>% + TRUE ~ char)) %>% collect - + expect_equal(ncol(df2), 3) - + # testing fn = function(a, b) { a+b } - + df3 = value %>% dplyr::mutate(b = fn(num, num)) %>% collect - + expect_equal(ncol(df3), 3) - - + + global_var = 100 - + df4 = value %>% dplyr::mutate(b = fn(num, num), d = global_var*2) %>% collect - + expect_equal(ncol(df4), 4) expect_true(all(df4$d == 200)) }) test_that("testing mutate user-defined function", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - + + udf = function(a1, b1) { a1 + b1 } - + df = b %>% mutate(d = udf(a,b)) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) }) test_that("testing transmute", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - transmute(d = a + b) %>% + + df = b %>% + transmute(d = a + b) %>% collect - + expect_setequal(names(df), c("d")) }) test_that("testing arrange", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + expect_warning(df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% arrange(desc(random_unif))) - + df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% chunk_arrange(desc(random_unif)) - + x = purrr::map_lgl(1:nchunks(df), ~{ is.unsorted(.x) == FALSE }) - + expect_true(all(x)) }) test_that("testing chunk_summarise", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + df = b %>% - chunk_summarise(suma = sum(a)) %>% - collect %>% + chunk_summarise(suma = sum(a)) %>% + collect %>% summarise(suma = sum(suma)) - + expect_equal(df$suma, collect(b)$a %>% sum) }) @@ -154,21 +163,36 @@ test_that("testing mutate within function works", { test_f <- function(params, x_df){ x_df %>% mutate(aha = params[1]*cyl + 
params[2]*disp) } - + expect_true("aha" %in% names(test_f(c(1, 2), mtcars))) }) test_that("filter failure: prevent github #191 regression", { flights_df = as.disk.frame(nycflights13::flights) - + # expect error due to syntax error - expect_error(flights_df %>% - filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% + expect_error(flights_df %>% + filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% collect) - + delete(flights_df) }) +test_that("testing {{}}", { + ok <- function(input_data, col) { + val = 2 + input_data %>% + mutate({{col}} + val, val2 = speed+dist) %>% + collect + } + + a = ok(as.disk.frame(cars), dist) + + b = ok(cars, dist) + + expect_equal(a, b) +}) + teardown({ fs::dir_delete(file.path(tempdir(), "tmp_b_dv.df")) diff --git a/tests/testthat/test-group-by.R b/tests/testthat/test-group-by.R index e61f75d3..e28b610b 100644 --- a/tests/testthat/test-group-by.R +++ b/tests/testthat/test-group-by.R @@ -299,6 +299,51 @@ test_that("tests for {{}}", { expect_equal(a, b) }) + +test_that("tests for global", { + # TODO make this work + val = 2 + val2 = 2 + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + expect_equal(b, a) + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(a, b) + + a = iris %>% + summarize(mean(Petal.Length+val2)) + + b = iris %>% + as.disk.frame %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(names(a), names(b)) + + expect_equal(a[[1]], b[[1]]) +}) + teardown({ # 
fs::file_delete(file.path(tempdir(), "tmp_pls_delete_gb.csv")) # fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_gb.df")) diff --git a/tests/testthat/test-names.r b/tests/testthat/test-names.r index 2d114aeb..6cc65022 100644 --- a/tests/testthat/test-names.r +++ b/tests/testthat/test-names.r @@ -12,13 +12,13 @@ test_that("testing names", { expect_setequal(names(b), c("a","b")) }) -test_that("testing names with lazyfn", { - b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% - mutate(d = a + b) - - expect_setequal(colnames(b), c("a","b", "d")) - expect_setequal(names(b), c("a","b", "d")) -}) +# test_that("testing names with lazyfn", { +# b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% +# mutate(d = a + b) +# +# expect_setequal(colnames(b), c("a","b", "d")) +# expect_setequal(names(b), c("a","b", "d")) +# }) teardown({ fs::dir_delete(file.path(tempdir(), "tmp_names.df")) diff --git a/tests/testthat/test-partitions.r b/tests/testthat/test-partitions.r new file mode 100644 index 00000000..3fb65f29 --- /dev/null +++ b/tests/testthat/test-partitions.r @@ -0,0 +1,24 @@ +context("test-partitions") + +setup({ + setup_disk.frame(workers = 1) +}) + +test_that("test partitions", { + a = as.disk.frame(cars) %>% + write_disk.frame(partitionby="speed", outdir=tempfile()) + + a = a %>% + partition_filter(speed < 10) %>% + collect %>% + arrange(speed, dist) + + + b = cars %>% + filter(speed < 10) %>% + arrange(speed, dist) + + for(n in names(a)) { + expect_equal(a %>% select(!!n), b %>% select(!!n)) + } +}) diff --git a/tutorials/0_feature_tools_drake.r b/tutorials/0_feature_tools_drake.r deleted file mode 100644 index 2ad73881..00000000 --- a/tutorials/0_feature_tools_drake.r +++ /dev/null @@ -1,20 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -library(drake) - -plan = drake::drake_plan( - df = target( - disk.frame::csv_to_disk.frame( - file_in("D:/data/mortgage-risk/feature_matrix_cleaned.csv"), - outdir = drake::drake_tempfile() - ), - format = "diskframe" 
- ) -) - -system.time(make(plan)) - -loadd(df) - -df diff --git a/tutorials/1_feature_tools.r b/tutorials/1_feature_tools.r deleted file mode 100644 index e5bd3282..00000000 --- a/tutorials/1_feature_tools.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - - -system.time( - a <- - csv_to_disk.frame( - "c:/data/feature_matrix_cleaned.csv", - in_chunk_size = 1e5/4 - ) -) - -system.time(rechunk(a, 16, outdir = "c:/data/ft.df", shardby="sk_id_curr")) diff --git a/tutorials/2_feature_tools.r b/tutorials/2_feature_tools.r deleted file mode 100644 index 88b3c081..00000000 --- a/tutorials/2_feature_tools.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -loadd(gd) - -system.time(corgd = cor(gd)) - -library(data.table) - -system.time( - a <- - data.table::fread( - "C:/Users/ZJ.DAI/Documents/git/disk.frame/tutorials/gd.csv")) - - -if(nchunks(df) != 32) { - rechunk(df, 32, shardby= "sk_id_curr") -} - -# rechunk(df, 32) -plan <- drake::drake_plan( - df = disk.frame(file_in("d:/data/ft.df")), - df = { - if(nchunks(df) != 32) { - rechunk(df, 32, shardby= "sk_id_curr") - } - }, - bads = df %>% - filter(target == 1) %>% - collect(parallel=FALSE) -) - - - -system.time() - -sfrac = nrow(bads)/(nrow(df) - nrow(bads)) - -# sfrac = 0.07490269 -system.time(df1 <- df %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE)) - -goods = df1 - -gd = rbindlist(list(bads, goods)) -gd - - -pryr::object_size(gd) -arrow::write_parquet(gd, "gd.parquet") -fst::write_fst(gd, "gd.fst") -data.table::fwrite(gd, "gd.csv") - -sample_frac(gd, 0.25) %>% - 
data.table::fwrite("gd.sample.csv") - -gd = fst::read_fst("gd.fst") -library(DataExplorer) -DataExplorer::create_report(gd) - diff --git a/tutorials/2_feature_tools_drake.r b/tutorials/2_feature_tools_drake.r deleted file mode 100644 index a8cb864c..00000000 --- a/tutorials/2_feature_tools_drake.r +++ /dev/null @@ -1,63 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv"))#, - # intro_plot = DataExplorer::create_report( - # gd, - # output_file = file_out("output/dataexplorer/plot_intro.html") - # ) -) - -make(plan) - -loadd(gd) - -intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") -) - -build_times(plan) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/2_feature_tools_drake.rmd b/tutorials/2_feature_tools_drake.rmd deleted file mode 100644 index cdbdc84c..00000000 --- a/tutorials/2_feature_tools_drake.rmd +++ /dev/null @@ -1,60 +0,0 @@ -```{r} -library(disk.frame) -library(data.table) 
-library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -```{r} -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")), - intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") - ) -) -system.time(make(plan)) -``` - -```{r} -print(getwd()) - -``` - - -```{r} -loadd(gd) -gd -``` - -``{r} -build_times(plan) -``` diff --git a/tutorials/2_feature_tools_no_plo.r b/tutorials/2_feature_tools_no_plo.r deleted file mode 100644 index a96011cb..00000000 --- a/tutorials/2_feature_tools_no_plo.r +++ /dev/null @@ -1,52 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - 
sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")) -) - -make(plan) - -loadd(gd) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/30-g.R b/tutorials/30-g.R deleted file mode 100644 index 7b5a9ee7..00000000 --- a/tutorials/30-g.R +++ /dev/null @@ -1,49 +0,0 @@ -library(disk.frame) - -setup_disk.frame(12) - -a = disk.frame("c:/data/airontimecsv.df/") - -system.time(b <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum(DEP_DELAY)) %>% - collect) - - -path_to_data <- "c:/data/" -rows = 148619656 -recommended_nchunks = recommend_nchunks(file.size(file.path(path_to_data, "combined.csv"))) -in_chunk_size = ceiling(rows/ recommended_nchunks) - -path_to_data = "c:/data/AirOnTimeCSV/" -#path_to_data = "d:/data/" -system.time(a <- csv_to_disk.frame( - list.files(path_to_data, pattern = ".csv$", full.names = TRUE), - outdir = file.path("c:/data/", "airontimecsv.df"), - colClasses = list(character = c("WHEELS_OFF", "WHEELS_ON")) -)) - - -system.time(flights.df <- csv_to_disk.frame( - paste0(path_to_data, "combined.csv"), - outdir = paste0(path_to_data, "combined.laf.df"), - in_chunk_size = in_chunk_size, - backend = "LaF" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readr.df"), - in_chunk_size = in_chunk_size, - colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readr" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readLines.df"), - in_chunk_size = in_chunk_size, - 
colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readLines" -)) - diff --git a/tutorials/bigreadr.R b/tutorials/bigreadr.R deleted file mode 100644 index c2deac1e..00000000 --- a/tutorials/bigreadr.R +++ /dev/null @@ -1,59 +0,0 @@ -library(disk.frame) - -setup_disk.frame() - -system.time(b <- csv_to_disk.frame( - "c:/data/combined.csv", - colClasses = list(character = c("WHEELS_ON", "WHEELS_OFF")), - in_chunk_size = 4e6, - chunk_reader = "bigreadr" -)) - -system.time(nl <- bigreadr::nlines("c:/data/combined.csv")) -system.time(a <- bigreadr::split_file( - "c:/data/combined.csv", - every_nlines = 4e6, - repeat_header = TRUE)) - -system.time(b <- csv_to_disk.frame( - bigreadr::get_split_files(a), - header = TRUE, - colClasses = list(character = c("WHEELS_ON", "WHEELS_OFF")) -)) - - - -#' system(' -#' @echo off -#' Setlocal EnableDelayedExpansion -#' for /f "usebackq" %%a in (`dir /b %1`) do ( -#' for /f "usebackq" %%b in (`type %%a ^| find "" /v /c`) do ( -#' set /a lines += %%b -#' ) -#' ) -#' echo %lines% -#' endlocal -#' -#' CountLines "c:\data\combined.csv" -#' ') - -a = disk.frame(file.path(path_to_data, "airontimecsv.df")) - - -system.time(r_mean_del_delay <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum_delay = sum(DEP_DELAY, na.rm = TRUE), n = n()) %>% - collect %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(mean_delay = sum(sum_delay)/sum(n))) - - -library(lubridate) - -dep_delay = r_mean_del_delay %>% - arrange(YEAR, MONTH, DAY_OF_MONTH) %>% - mutate(date = ymd(paste(YEAR, MONTH, DAY_OF_MONTH, sep = "-"))) - -library(ggplot2) -ggplot(dep_delay, aes(date, mean_delay)) + geom_smooth() - diff --git a/tutorials/cran-logs.r b/tutorials/cran-logs.r deleted file mode 100644 index 3628e513..00000000 --- a/tutorials/cran-logs.r +++ /dev/null @@ -1,78 +0,0 @@ -a = data.table::fread("c:/Users/ZJ.DAI/Downloads/a (1).csv") - -cran.stats::read_logs(start=as.Date("2019-08-27"), as.Date("2019-08-27"), dir = 
"d:/data/cran-logs") - -library(dplyr) -library(data.table) -a[,date:=as.Date(date, "%Y-%m-%d")] - -a1 = a %>% - # filter(N < 100) %>% - right_join(data.frame(date = seq(min(a$date), max(a$date), by="day"))) - -setDT(a1) -a1[,N := ifelse(is.na(N), (lag(N) + lead(N))/2, N)] -a2 = a1[!is.na(N)] -fwrite(a2, "c:/data/a1.csv", row.names = FALSE) - -a3 = cbind(a2, a2[,shift(N,-3:3, type = "lead")] %>% - apply(1,mean)) %>% - filter(!is.na(V2)) %>% - select(-N) -fwrite(a3, "c:/data/a3.csv", row.names = FALSE) - -plot(a3) - -library(pixapi) - -result = pixapi::get_pix_forecasts("https://ep-d5adc7ba-0218-44f2-9ed9-eb6d202784e8.serving.aiaengine.com/invocations", 14) -setDT(result) -a4 = rbind(a3, result[,.(date = as.Date(date), V2)]) -plot(a4) - -a4 %>% - full_join(a1) -> a5 - -write.csv(a5, "a5.csv") - - -rs = cran.stats::read_logs(start = as.Date("2019-09-26"), as.Date("2019-09-30"), dir = "cran-logs") - -library(drake) -library(disk.frame) -setup_disk.frame() - - - -plan <- drake_plan( - lf1 = list.files( - file_in("d:/data/cran-logs"), - pattern="*.csv", - full.names = T - ), - x = target( - disk.frame::csv_to_disk.frame( - lf1, - outdir = drake_tempfile()), - format = "diskframe" - ) -) - -make(plan) - -config <- drake_config(plan) -vis_drake_graph(config) - -x = readd(lf1) - -head(x) - -readd(x) -system.time( - df <- csv_to_disk.frame(list.files("d:/data/cran-logs", pattern="*.csv", full.names = T)) -) - -head(df) - -df = shard(df, shardby = "date") -df[package=="disk.frame",.N, date, keep=c("package","date")][order(date)] diff --git a/tutorials/fannie-mae-to-parquet/0-setup.r b/tutorials/fannie-mae-to-parquet/0-setup.r deleted file mode 100644 index dc5fd196..00000000 --- a/tutorials/fannie-mae-to-parquet/0-setup.r +++ /dev/null @@ -1,15 +0,0 @@ -library(disk.frame) -setup_disk.frame() - - -fm.df = disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - -fm.df %>% - mutate( - yr = substr(monthly.rpt.prd, 7, 11) %>% as.numeric, - mth = substr(monthly.rpt.prd, 1, 2) %>% 
as.numeric) %>% - rechunk(nchunks = nchunks(fm.df), outdir = "c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/", shardby = c("yr", "mth")) - - - - diff --git a/tutorials/fannie-mae-to-parquet/1-testing.r b/tutorials/fannie-mae-to-parquet/1-testing.r deleted file mode 100644 index 99008220..00000000 --- a/tutorials/fannie-mae-to-parquet/1-testing.r +++ /dev/null @@ -1,7 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -df1 = disk.frame("c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/") -system.time(a <- df1[,.N, .(yr, mth), keep=c("yr", "mth")]) - -a diff --git a/tutorials/fannie_mae/00_setup.r b/tutorials/fannie_mae/00_setup.r deleted file mode 100644 index 1fb5ffcd..00000000 --- a/tutorials/fannie_mae/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", 
"character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae/01a_read_from_csv.r b/tutorials/fannie_mae/01a_read_from_csv.r deleted file mode 100644 index e97b7c65..00000000 --- a/tutorials/fannie_mae/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but 
slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r b/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r deleted file mode 100644 index be535fed..00000000 --- a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r +++ /dev/null @@ -1,37 +0,0 @@ -source("inst/fannie_mae/0_setup.r") - -# number of rows to read in -compress = 50 - -# how many chunks do we need? 
-relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -pt <- proc.time() -system.time(lapply(1:l, function(i) { -#system.time(future_lapply(1:6, function(i) { - relative_file_pathi = relative_file_path[i] - full_file_path - csv_to_disk.frame( - full_file_path[i], - glue("test_fm/{relative_file_path[i]}"), - shardby="loan_id", - nchunks=nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|", - compress=compress, - overwrite = T) -})) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r deleted file mode 100644 index 1ee3669c..00000000 --- a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r +++ /dev/null @@ -1,5 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame("fmdf") - -rechunk(fmdf, 128) diff --git a/tutorials/fannie_mae/01e_read_data_for_harp.r b/tutorials/fannie_mae/01e_read_data_for_harp.r deleted file mode 100644 index b9f1173c..00000000 --- a/tutorials/fannie_mae/01e_read_data_for_harp.r +++ /dev/null @@ -1,85 +0,0 @@ -#1c2 -source("tutorials/fannie_mae/00_setup.r") - -pt <- proc.time() - -# rows to read in one go -rows_to_read = 1e7 - -# load the Fannie Mae disk.frame -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = Performance_Variables)) -harp_mapping <- fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", 
"harp_id")) -setkey(harp_mapping, harp_id) -fst::write_fst(harp_mapping,"harp_mapping.fst.tmp") -fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst") -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - -# took about 438.65 on laptop 500 chunks -# took about 205 on desktop 56 chunks -pt <- proc.time() -system.time( - harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) { - setnames(df, "loan_id", "harp_id") - - merge(df, harp_mapping, by="harp_id") - }, - nchunks = nchunks(fmdf), - in_chunk_size = rows_to_read, - shardby = "loan_id", - outdir = file.path(outpath, "harp.df"), - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|")) - -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - - -if(F) { - # it can be seen that some accounts can start n harp the month the same that it ends in the dataset - harp1 = get_chunk.disk.frame(harp,1) - fmdf1 = get_chunk.disk.frame(fmdf,1) - - harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - - fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)] - - fmdf3 = merge( - fmdf2[,.(max_date = max(date)),loan_id], - harp1[,.(min_date = min(date)),loan_id], - by = "loan_id" - ) - - fmdf3[max_date == min_date,] - - fmdf3[max_date > min_date,] - fmdf3[max_date <= min_date,] - - harp4 = harp[,.N, delq.status, keep="delq.status"] -} - - -# print(nrow(harp)) -# system.time(harp <- merge(harp, harp_mapping, by="harp_id")) -# print(nrow(harp)) -# setkey(harp,"loan_id") -# system.time(fst::write_fst(harp,"harp.fst")) # 16.63 -#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds - -# check if the harp_id -> loan_id is successful -if(F) { - harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt") - fst::write_fst(harp_mapping,"harp_mapping.fst") - - harp[,date:=as.Date(month,"%m/%d/%Y")] - harp[,.N,date][order(date)] - 
fmdf = disk.frame("fmdf") - system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - harp_uid = unique(harp_mapped$loan_id) - def = intersect(uid, harp_uid) - def -} \ No newline at end of file diff --git a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r b/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r deleted file mode 100644 index 3d344ac8..00000000 --- a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r +++ /dev/null @@ -1,40 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -# system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -# setnames(harp, names(harp), c("harp_id","month")) -# -# harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp_mapped = merge(harp, harp_mapping, by="harp_id") -# c(nrow(harp), nrow(harp_mapped)) -# -# harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp[,date:=as.Date(month,"%m/%d/%Y")] -# harp[,.N,date][order(date)] - -system.time(harp <- fst::read_fst("harp.fst")) - -fmdf = disk.frame("fmdf") - -if(F) { - # this was run once to confimr that the -} -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id",500, "harp.df", overwrite = T, append = F)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program - - - diff --git a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r b/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r deleted file mode 100644 index 937bd874..00000000 --- 
a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r +++ /dev/null @@ -1,10 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -# create a most compressed set -a = disk.frame("fmdf") -chunk_lapply(a,function(x) x, outdir="fmdf0", chunks=500, compress=100) - -a1 = disk.frame("fmdf0") - -system.time(print(a[,.N,keep="delq.status"] %>% sum)) -system.time(print(a1[,.N,keep="delq.status"] %>% sum)) diff --git a/tutorials/fannie_mae/02_exploratory.r b/tutorials/fannie_mae/02_exploratory.r deleted file mode 100644 index 2a362d81..00000000 --- a/tutorials/fannie_mae/02_exploratory.r +++ /dev/null @@ -1,19 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame("fmdf0") - -system.time(a <- fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status]) -setkey(a,delq.status); a - -str(a) - -str(head(fmdf)) - -b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b - - -names(fmdf) -servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")] - -servicer.name_cnt[,sum(N),servicer.name] diff --git a/tutorials/fannie_mae/02a_create_forward_looking_flag.r b/tutorials/fannie_mae/02a_create_forward_looking_flag.r deleted file mode 100644 index 2d7dc6bc..00000000 --- a/tutorials/fannie_mae/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,97 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -harp = disk.frame(file.path(outpath, "harp.df")) - -# harp contains the first date on which a loan enters into HARP -pt <- proc.time() -harp1 <- harp %>% - srckeep(c("loan_id","monthly.rpt.prd")) %>% - group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine - summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>% - collect(parallel = T) # performs the collection in parallel - -cat(glue("Creating first harp date took: {timetaken(pt)}")) - -if(F) { - # data.table syntax - harp[,.(first_harp_date = 
min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))), loan_id, - keep=c("loan_id", "monthly.rpt.prd")] -} - -harp1[,before_12m_first_harp_date:=first_harp_date] -# 164 seconds -system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12) -harp1[,harp_12m := TRUE] - -# break it out into smaller chunks for even faster merging -# took 27 -pt <- proc.time() -harp2 <- shard( - harp1, - "loan_id", - nchunks = nchunks(fmdf), - outdir = file.path(outpath, "first_harp_date.df"), - overwrite=T) -cat(glue::glue("sharding HARP defaults took {timetaken(pt)}")) - -# no need to hard group by it's already sharded by loan_id -# took about 503 seconds -# took about 11:52 -pt <- proc.time() -defaults <- cmap(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -# -# a = df[loan_id == "100513171914", ] -# -# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# a[,monthly.rpt.prd2:=monthly.rpt.prd] -# -# -# setkey(df3,"loan_id","start_date","end_date") -# -# a1= 
foverlaps(a, df3, by.x = c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"), -# by.y=c("loan_id","start_date","end_date")) -# -# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)] - -# create a list that show defaults ---------------------------------------- -# system.time(defaults <- chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae/02b_foverlaps.r b/tutorials/fannie_mae/02b_foverlaps.r deleted file mode 100644 index a89d107a..00000000 --- a/tutorials/fannie_mae/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - 
overwrite = T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d4..00000000 --- a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap := pmin(5, as.integer(delq.status))] -tmp5[delq.status == "X", delq.status.cap := 0] -tmp5[delq.status == "", delq.status.cap := 0] - - -tmp6 <- tmp5[,.(ndef=sum(ndef), 
N=sum(N)),.(monthly.rpt.prd, delq.status.cap)] - - -tmp6[,defr:=ndef/N] - -library(ggplot2) -tmp6 %>% - ggplot + - geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap))) - - diff --git a/tutorials/fannie_mae/02c_plot_odr.r b/tutorials/fannie_mae/02c_plot_odr.r deleted file mode 100644 index 4521511a..00000000 --- a/tutorials/fannie_mae/02c_plot_odr.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T), - n_goto_harp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N), - oh = sum(n_goto_harp)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr, - `HARP Conversion Rate%` = oh - )) - - -if(F) { - # data.table syntax - system.time(a_wh <- tmp4[ - ,.(.N, n_defaults = sum(default_12m, na.rm = T), ) - ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")]) - - a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd] - setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%")) -} - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae/02d_harp_check.r b/tutorials/fannie_mae/02d_harp_check.r deleted file mode 100644 index f9fd020a..00000000 --- a/tutorials/fannie_mae/02d_harp_check.r +++ /dev/null @@ -1,33 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -system.time(harp <- 
fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -setnames(harp,names(harp), c("harp_id","month")) - -harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp_mapped = merge(harp, harp_mapping, by="harp_id") -c(nrow(harp), nrow(harp_mapped)) - -harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp[,date:=as.Date(month,"%m/%d/%Y")] -harp[,.N,date][order(date)] - -fmdf = disk.frame("fmdf") - -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program \ No newline at end of file diff --git a/tutorials/fannie_mae/10_a_one_var.R b/tutorials/fannie_mae/10_a_one_var.R deleted file mode 100644 index fae7540c..00000000 --- a/tutorials/fannie_mae/10_a_one_var.R +++ /dev/null @@ -1,191 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - 
monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost 
-dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, 
-# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae/10_a_one_var_with_fn.R b/tutorials/fannie_mae/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc7..00000000 --- a/tutorials/fannie_mae/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - 
"default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10_b_two_vars.r b/tutorials/fannie_mae/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - 
monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index bd708ed5..00000000 --- a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", 
"orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/10b_make_scorecard_visible.R b/tutorials/fannie_mae/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = 
evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - 
-scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/11_speedglm.R b/tutorials/fannie_mae/11_speedglm.R deleted file mode 100644 index 17b38269..00000000 --- a/tutorials/fannie_mae/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae/12_keras.r b/tutorials/fannie_mae/12_keras.r deleted file mode 100644 index 732f02a3..00000000 --- a/tutorials/fannie_mae/12_keras.r +++ 
/dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - 
#,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae/13_disk.r b/tutorials/fannie_mae/13_disk.r deleted file mode 100644 index 10adc926..00000000 --- a/tutorials/fannie_mae/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf 
= disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae/3a_get_some_data.r b/tutorials/fannie_mae/3a_get_some_data.r deleted file mode 100644 index b86128a8..00000000 --- a/tutorials/fannie_mae/3a_get_some_data.r +++ /dev/null @@ -1,113 +0,0 @@ -library(disk.frame) - -source("inst/fannie_mae/0_setup.r") - -a = disk.frame(file.path(outpath, "fm_with_harp")) - -pt = proc.time() -a1 <- a %>% - srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>% - delayed(~{ - setkey(.x, loan_id) - uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01) - .x[uid,] -}) %>% collect(parallel = T) -timetaken(pt) - -a1[delq.status != "X",delq.statusn := as.numeric(delq.status)] -a1[delq.status %in% c("","X"),delq.statusn := 0] - -#a1[, delq.statusn := pmin(delq.statusn, 3)] - -a1[,.N,delq.statusn] - -a1[is.na(delq.statusn)] - -# remove those that have already defulat -a2 = a1 - -a2[is.na(default_12m), default_12m := FALSE] - -a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)] -setkey(a3, loan_id, monthly.rpt.prd) - -a3[,sum(default_12m)/.N,delq.statusn] - -# create worst delq status last 12 months -system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id]) - -eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, na.rm=T)]"))) - -a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)]) - -a6 = a5#[delq.statusn < 3,] - -a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)] 
- -a6[,.N, worst_delq_last_12m_capped] - -a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7) - -a6dev = a6[a6devid] -a6val = a6[!a6devid] - -a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped] - -a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped") -a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped") - -setkey(a6val, loan_id, monthly.rpt.prd) -setkey(a6dev, loan_id, monthly.rpt.prd) - -for_write <- function(a6val) { - a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ] - a6val1[,N:=.N,loan_id] - - a6val2 = a6val1[N == 12,] - setkey(a6val2, loan_id, monthly.rpt.prd) - - toremove = a6val2[,.SD[12,delq.statusn], loan_id] - - toremove1 = toremove[V1 >= 3, loan_id] - - a6val2 = a6val2[!(loan_id %in% toremove1), ] - - a6val2[order(loan_id), id := rleid(loan_id)] - setkey(a6val2, loan_id, monthly.rpt.prd) - a6val2 -} - - -a6val2 <- for_write(a6val) -a6dev2 <- for_write(a6dev) - - -a6dev2[,.N, delq.statusn] -a6val2[,.N, delq.statusn] - -fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv") -fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv") - - - -gini <- function(a6val) { - setkey(a6val, id, monthly.rpt.prd) - a6val = a6val[seq(12,.N, by=12),] - a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd] - - a6_auc[,score:=-pd] - setkey(a6_auc, score) - - a6_auc[order(score), height := bads/sum(bads)] - a6_auc[order(score), width := tots/sum(tots)] - plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l") - abline(0,1) - a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc) - a6_auc2[,cheight := cumsum(height)] - a6_auc2[,cwidth := cumsum(width)] - 2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1 -} - -gini(a6val2) -gini(a6dev2) - diff --git a/tutorials/fannie_mae/4a_read_appl.r b/tutorials/fannie_mae/4a_read_appl.r 
deleted file mode 100644 index 6edcd593..00000000 --- a/tutorials/fannie_mae/4a_read_appl.r +++ /dev/null @@ -1,57 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -fs::dir_delete(file.path(outpath, "acq.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae/4b_exploratory.R b/tutorials/fannie_mae/4b_exploratory.R 
deleted file mode 100644 index 1cf4382d..00000000 --- a/tutorials/fannie_mae/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default ------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - 
#filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae/4c_explore_xgboost.r b/tutorials/fannie_mae/4c_explore_xgboost.r deleted file mode 100644 index f6a53cba..00000000 --- a/tutorials/fannie_mae/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 = disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = 
data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae/4d_xgboost.r b/tutorials/fannie_mae/4d_xgboost.r deleted file mode 100644 index 248e676a..00000000 --- a/tutorials/fannie_mae/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor 
%>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - 
select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae/5a_appl_model.r b/tutorials/fannie_mae/5a_appl_model.r deleted file mode 100644 index 2c7d97e1..00000000 --- a/tutorials/fannie_mae/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - -defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae/5b_one_var.r b/tutorials/fannie_mae/5b_one_var.r deleted file mode 100644 index 
f95f221d..00000000 --- a/tutorials/fannie_mae/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb 
%>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred 
-monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae/5c_using_a_fn.r 
b/tutorials/fannie_mae/5c_using_a_fn.r deleted file mode 100644 index e083fa54..00000000 --- a/tutorials/fannie_mae/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/5d_AutoML.r b/tutorials/fannie_mae/5d_AutoML.r deleted file mode 100644 index 7525c879..00000000 --- a/tutorials/fannie_mae/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - 
rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev = acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae/5e_AutoML_actual.r b/tutorials/fannie_mae/5e_AutoML_actual.r deleted file mode 100644 index 1e591543..00000000 --- a/tutorials/fannie_mae/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = 
features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/5f_make_scorecard_visible.R b/tutorials/fannie_mae/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a29..00000000 --- a/tutorials/fannie_mae/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- 
function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) 
- -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/6a_origination_year.r b/tutorials/fannie_mae/6a_origination_year.r deleted file mode 100644 index 8115a782..00000000 --- a/tutorials/fannie_mae/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae/7a_speedglm.r b/tutorials/fannie_mae/7a_speedglm.r deleted file mode 100644 index 974023bb..00000000 --- a/tutorials/fannie_mae/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = 
sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - -library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! 
make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae/8a_keras.r b/tutorials/fannie_mae/8a_keras.r deleted file mode 100644 index ef2e1486..00000000 --- a/tutorials/fannie_mae/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = 
sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/fannie_mae_10pct/00_setup.r b/tutorials/fannie_mae_10pct/00_setup.r deleted file mode 100644 
index 23df1718..00000000 --- a/tutorials/fannie_mae_10pct/00_setup.r +++ /dev/null @@ -1,44 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -auc = disk.frame:::auc -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -raw_perf_data_path = "C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", 
"character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/fannie_mae_10pct/01a_read_from_csv.r b/tutorials/fannie_mae_10pct/01a_read_from_csv.r deleted file mode 100644 index abfe3062..00000000 --- a/tutorials/fannie_mae_10pct/01a_read_from_csv.r +++ /dev/null @@ -1,47 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = 1e7 - -# compression ratio, max = 100 for best compression but slower running speed -#compress = 50 - -# set up some variable for future use -relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -# order the order of conversion by prioritising the largest files first. 
Because -# handling the largest files are the most difficult, and if an error occurs it -# is more likely to occur when converting large files, hence this will allow us -# to identify errors early -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -# convert CSV in parallel -pt <- proc.time() -res = future.apply::future_mapply(function(relative_file_pathi, full_file_pathi) { -#res = mapply(function(relative_file_pathi, full_file_pathi) { - relative_file_pathi # for glue - df = csv_to_disk.frame( - full_file_pathi, - file.path(outpath, glue("raw_fannie_mae/{relative_file_pathi}")), - shardby = "loan_id", - nchunks = nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - #compress = compress, - in_chunk_size = nreadin, - overwrite = T) - - df -}, relative_file_path, full_file_path, SIMPLIFY = F) -print(timetaken(pt)) diff --git a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r b/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r deleted file mode 100644 index 78ccf3e1..00000000 --- a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r +++ /dev/null @@ -1,31 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -disk.frame_folders = dir(file.path(outpath, "raw_fannie_mae"),full.names = T) -list_of_disk.frames <- purrr::map(disk.frame_folders, disk.frame) - -pt <- proc.time() -fmdf = rbindlist.disk.frame( - list_of_disk.frames, - outdir = file.path(outpath, "fm.df"), - by_chunk_id = T, - overwrite = T) -print(timetaken(pt)) - -nrow(fmdf) - -nchunks(fmdf) - -fmdf %>% delayed(~.x[,.(sum(is.na(prin_forg_upb_oth )))] -) %>% dput - - -if(F) { - #239.44 - system.time(uid <- fmdf %>% - srckeep("loan_id") %>% - summarise(uid = unique(loan_id)) %>% - collect) - # the sharding is correct - - n_distinct(uid$uid) -} diff --git a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r 
b/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r deleted file mode 100644 index 0a0739fc..00000000 --- a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -# rechunk respects the shardkey -system.time(rechunk(fmdf, nchunks(fmdf)*2)) - -nrow(fmdf) - -nchunks(fmdf) - - -if(F) { - system.time(rechunk(fmdf, 6)) -} - diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r deleted file mode 100644 index 041b3334..00000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,65 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) - -pt = proc.time() -eg = fmdf %>% - srckeep(c("loan_id", "delq.status")) %>% - group_by(loan_id) %>% - summarise(n = n(), max.delq.status = max(as.integer(delq.status))) %>% - filter(n >= 12, max.delq.status == 3) %>% - delayed(~.x[1,]) %>% - collect -timetaken(pt) - -pt = proc.time() -fmdf_eg = fmdf %>% - srckeep(c("monthly.rpt.prd", "loan_id", "delq.status")) %>% - inner_join(eg, by = "loan_id") %>% - collect %>% - mutate(monthly.rpt.prd = as.Date(monthly.rpt.prd, "%m/%d/%Y")) %>% - arrange(loan_id, monthly.rpt.prd) -timetaken(pt) - -View(fmdf_eg) # see 2nd account - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr 
:= 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r deleted file mode 100644 index e5320e1f..00000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r +++ /dev/null @@ -1,47 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -outpath -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -nrow(fmdf) - - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 
months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02b_foverlaps.r b/tutorials/fannie_mae_10pct/02b_foverlaps.r deleted file mode 100644 index d89094de..00000000 --- a/tutorials/fannie_mae_10pct/02b_foverlaps.r +++ /dev/null @@ -1,29 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -head(tmp3) diff --git a/tutorials/fannie_mae_10pct/02c_plot_odr.r b/tutorials/fannie_mae_10pct/02c_plot_odr.r deleted file mode 100644 index fc8308ab..00000000 --- a/tutorials/fannie_mae_10pct/02c_plot_odr.r +++ /dev/null @@ -1,34 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -nrow(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect(parallel = T) %>% - 
group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) - - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae_10pct/10_a_one_var.R b/tutorials/fannie_mae_10pct/10_a_one_var.R deleted file mode 100644 index e5b3cf03..00000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var.R +++ /dev/null @@ -1,55 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - - -# xgboost fit ------------------------------------------------------------- - - -################################################### -# the xgboost fit -################################################### -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -# plotting ---------------------------------------------------------------- - - -################################################### -# plotting -################################################### -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 
= max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() diff --git a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R b/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R deleted file mode 100644 index 266fb107..00000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- disk.frame:::add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), 
~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_b_rounded.R b/tutorials/fannie_mae_10pct/10_b_rounded.R deleted file mode 100644 index 73c1893a..00000000 --- a/tutorials/fannie_mae_10pct/10_b_rounded.R +++ /dev/null @@ -1,123 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = 
prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# 
-# } \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/10_b_two_vars.r b/tutorials/fannie_mae_10pct/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae_10pct/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_c_two_vars.R b/tutorials/fannie_mae_10pct/10_c_two_vars.R deleted file mode 100644 index 3124d577..00000000 --- a/tutorials/fannie_mae_10pct/10_c_two_vars.R +++ /dev/null @@ -1,64 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() 
-firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) -secondm - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot( - map_dbl( - list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9), main="AUC vs number of variables",ylab="AUC",xlab="Number of Variables") -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 66eb27b3..00000000 --- a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,91 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - 
while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - 
c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - 
score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 6df43da8..00000000 --- a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, 
monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - 
map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = 
var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/11_speedglm.R b/tutorials/fannie_mae_10pct/11_speedglm.R deleted file mode 100644 index 33b1323d..00000000 --- a/tutorials/fannie_mae_10pct/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming 
function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/12_keras.r b/tutorials/fannie_mae_10pct/12_keras.r deleted file mode 100644 index e43f2012..00000000 --- a/tutorials/fannie_mae_10pct/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, 
data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - 
))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/00_setup.r b/tutorials/fannie_mae_new/00_setup.r deleted file mode 100644 index 5bb675ec..00000000 --- a/tutorials/fannie_mae_new/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -# compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - 
-Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae_new/01a_read_from_csv.r b/tutorials/fannie_mae_new/01a_read_from_csv.r deleted file mode 100644 index e97b7c65..00000000 --- a/tutorials/fannie_mae_new/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin 
= NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r deleted file mode 100644 index 1ee3669c..00000000 --- a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r +++ /dev/null @@ -1,5 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame("fmdf") - -rechunk(fmdf, 128) diff --git a/tutorials/fannie_mae_new/01e_read_data_for_harp.r b/tutorials/fannie_mae_new/01e_read_data_for_harp.r deleted file mode 100644 index b9f1173c..00000000 --- a/tutorials/fannie_mae_new/01e_read_data_for_harp.r +++ /dev/null @@ -1,85 +0,0 @@ -#1c2 -source("tutorials/fannie_mae/00_setup.r") - -pt <- proc.time() - -# rows to read in one go -rows_to_read = 1e7 - -# load the Fannie Mae disk.frame -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = Performance_Variables)) -harp_mapping <- 
fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", "harp_id")) -setkey(harp_mapping, harp_id) -fst::write_fst(harp_mapping,"harp_mapping.fst.tmp") -fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst") -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - -# took about 438.65 on laptop 500 chunks -# took about 205 on desktop 56 chunks -pt <- proc.time() -system.time( - harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) { - setnames(df, "loan_id", "harp_id") - - merge(df, harp_mapping, by="harp_id") - }, - nchunks = nchunks(fmdf), - in_chunk_size = rows_to_read, - shardby = "loan_id", - outdir = file.path(outpath, "harp.df"), - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|")) - -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - - -if(F) { - # it can be seen that some accounts can start n harp the month the same that it ends in the dataset - harp1 = get_chunk.disk.frame(harp,1) - fmdf1 = get_chunk.disk.frame(fmdf,1) - - harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - - fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)] - - fmdf3 = merge( - fmdf2[,.(max_date = max(date)),loan_id], - harp1[,.(min_date = min(date)),loan_id], - by = "loan_id" - ) - - fmdf3[max_date == min_date,] - - fmdf3[max_date > min_date,] - fmdf3[max_date <= min_date,] - - harp4 = harp[,.N, delq.status, keep="delq.status"] -} - - -# print(nrow(harp)) -# system.time(harp <- merge(harp, harp_mapping, by="harp_id")) -# print(nrow(harp)) -# setkey(harp,"loan_id") -# system.time(fst::write_fst(harp,"harp.fst")) # 16.63 -#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds - -# check if the harp_id -> loan_id is successful -if(F) { - harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt") - 
fst::write_fst(harp_mapping,"harp_mapping.fst") - - harp[,date:=as.Date(month,"%m/%d/%Y")] - harp[,.N,date][order(date)] - fmdf = disk.frame("fmdf") - system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - harp_uid = unique(harp_mapped$loan_id) - def = intersect(uid, harp_uid) - def -} \ No newline at end of file diff --git a/tutorials/fannie_mae_new/02_exploratory.r b/tutorials/fannie_mae_new/02_exploratory.r deleted file mode 100644 index 2a362d81..00000000 --- a/tutorials/fannie_mae_new/02_exploratory.r +++ /dev/null @@ -1,19 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame("fmdf0") - -system.time(a <- fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status]) -setkey(a,delq.status); a - -str(a) - -str(head(fmdf)) - -b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b - - -names(fmdf) -servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")] - -servicer.name_cnt[,sum(N),servicer.name] diff --git a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r deleted file mode 100644 index 2d7dc6bc..00000000 --- a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,97 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -harp = disk.frame(file.path(outpath, "harp.df")) - -# harp contains the first date on which a loan enters into HARP -pt <- proc.time() -harp1 <- harp %>% - srckeep(c("loan_id","monthly.rpt.prd")) %>% - group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine - summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>% - collect(parallel = T) # performs the collection in parallel - -cat(glue("Creating first harp date took: {timetaken(pt)}")) - -if(F) { - # data.table syntax - harp[,.(first_harp_date = min(as.Date(monthly.rpt.prd, 
"%m/%d/%Y"))), loan_id, - keep=c("loan_id", "monthly.rpt.prd")] -} - -harp1[,before_12m_first_harp_date:=first_harp_date] -# 164 seconds -system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12) -harp1[,harp_12m := TRUE] - -# break it out into smaller chunks for even faster merging -# took 27 -pt <- proc.time() -harp2 <- shard( - harp1, - "loan_id", - nchunks = nchunks(fmdf), - outdir = file.path(outpath, "first_harp_date.df"), - overwrite=T) -cat(glue::glue("sharding HARP defaults took {timetaken(pt)}")) - -# no need to hard group by it's already sharded by loan_id -# took about 503 seconds -# took about 11:52 -pt <- proc.time() -defaults <- cmap(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -# -# a = df[loan_id == "100513171914", ] -# -# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# a[,monthly.rpt.prd2:=monthly.rpt.prd] -# -# -# setkey(df3,"loan_id","start_date","end_date") -# -# a1= foverlaps(a, df3, by.x = 
c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"), -# by.y=c("loan_id","start_date","end_date")) -# -# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)] - -# create a list that show defaults ---------------------------------------- -# system.time(defaults <- chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae_new/02b_foverlaps.r b/tutorials/fannie_mae_new/02b_foverlaps.r deleted file mode 100644 index 215c7890..00000000 --- a/tutorials/fannie_mae_new/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("tutorials/fannie_mae/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - overwrite 
= T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d4..00000000 --- a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap := pmin(5, as.integer(delq.status))] -tmp5[delq.status == "X", delq.status.cap := 0] -tmp5[delq.status == "", delq.status.cap := 0] - - -tmp6 <- tmp5[,.(ndef=sum(ndef), 
N=sum(N)),.(monthly.rpt.prd, delq.status.cap)] - - -tmp6[,defr:=ndef/N] - -library(ggplot2) -tmp6 %>% - ggplot + - geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap))) - - diff --git a/tutorials/fannie_mae_new/02c_plot_odr.r b/tutorials/fannie_mae_new/02c_plot_odr.r deleted file mode 100644 index 4521511a..00000000 --- a/tutorials/fannie_mae_new/02c_plot_odr.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T), - n_goto_harp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N), - oh = sum(n_goto_harp)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr, - `HARP Conversion Rate%` = oh - )) - - -if(F) { - # data.table syntax - system.time(a_wh <- tmp4[ - ,.(.N, n_defaults = sum(default_12m, na.rm = T), ) - ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")]) - - a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd] - setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%")) -} - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae_new/02d_harp_check.r b/tutorials/fannie_mae_new/02d_harp_check.r deleted file mode 100644 index f9fd020a..00000000 --- a/tutorials/fannie_mae_new/02d_harp_check.r +++ /dev/null @@ -1,33 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") 
- -system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -setnames(harp,names(harp), c("harp_id","month")) - -harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp_mapped = merge(harp, harp_mapping, by="harp_id") -c(nrow(harp), nrow(harp_mapped)) - -harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp[,date:=as.Date(month,"%m/%d/%Y")] -harp[,.N,date][order(date)] - -fmdf = disk.frame("fmdf") - -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program \ No newline at end of file diff --git a/tutorials/fannie_mae_new/10_a_one_var.R b/tutorials/fannie_mae_new/10_a_one_var.R deleted file mode 100644 index fae7540c..00000000 --- a/tutorials/fannie_mae_new/10_a_one_var.R +++ /dev/null @@ -1,191 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = 
"binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# 
transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- 
xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R b/tutorials/fannie_mae_new/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc7..00000000 --- a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm 
<- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/10_b_two_vars.r b/tutorials/fannie_mae_new/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae_new/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - 
prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 6f7a471e..00000000 --- a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - #browser() - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - #browser() - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", 
"orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R b/tutorials/fannie_mae_new/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - 
if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, 
"appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/11_speedglm.R b/tutorials/fannie_mae_new/11_speedglm.R deleted file mode 100644 index a7355ac9..00000000 --- a/tutorials/fannie_mae_new/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/12_keras.r b/tutorials/fannie_mae_new/12_keras.r 
deleted file mode 100644 index ba164e2c..00000000 --- a/tutorials/fannie_mae_new/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - #browser() - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - #browser() - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - 
#a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/13_disk.r b/tutorials/fannie_mae_new/13_disk.r deleted file mode 100644 index 10adc926..00000000 --- 
a/tutorials/fannie_mae_new/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae_new/3a_get_some_data.r b/tutorials/fannie_mae_new/3a_get_some_data.r deleted file mode 100644 index b86128a8..00000000 --- a/tutorials/fannie_mae_new/3a_get_some_data.r +++ /dev/null @@ -1,113 +0,0 @@ -library(disk.frame) - -source("inst/fannie_mae/0_setup.r") - -a = disk.frame(file.path(outpath, "fm_with_harp")) - -pt = proc.time() -a1 <- a %>% - srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>% - delayed(~{ - setkey(.x, loan_id) - uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01) - .x[uid,] -}) %>% collect(parallel = T) -timetaken(pt) - -a1[delq.status != "X",delq.statusn := as.numeric(delq.status)] -a1[delq.status %in% c("","X"),delq.statusn := 0] - -#a1[, delq.statusn := pmin(delq.statusn, 3)] - -a1[,.N,delq.statusn] - -a1[is.na(delq.statusn)] - -# remove those that have already defulat -a2 = a1 - -a2[is.na(default_12m), default_12m := FALSE] - -a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)] -setkey(a3, loan_id, monthly.rpt.prd) - -a3[,sum(default_12m)/.N,delq.statusn] - -# create worst delq status last 12 months -system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id]) - -eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, 
na.rm=T)]"))) - -a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)]) - -a6 = a5#[delq.statusn < 3,] - -a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)] - -a6[,.N, worst_delq_last_12m_capped] - -a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7) - -a6dev = a6[a6devid] -a6val = a6[!a6devid] - -a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped] - -a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped") -a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped") - -setkey(a6val, loan_id, monthly.rpt.prd) -setkey(a6dev, loan_id, monthly.rpt.prd) - -for_write <- function(a6val) { - a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ] - a6val1[,N:=.N,loan_id] - - a6val2 = a6val1[N == 12,] - setkey(a6val2, loan_id, monthly.rpt.prd) - - toremove = a6val2[,.SD[12,delq.statusn], loan_id] - - toremove1 = toremove[V1 >= 3, loan_id] - - a6val2 = a6val2[!(loan_id %in% toremove1), ] - - a6val2[order(loan_id), id := rleid(loan_id)] - setkey(a6val2, loan_id, monthly.rpt.prd) - a6val2 -} - - -a6val2 <- for_write(a6val) -a6dev2 <- for_write(a6dev) - - -a6dev2[,.N, delq.statusn] -a6val2[,.N, delq.statusn] - -fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv") -fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv") - - - -gini <- function(a6val) { - setkey(a6val, id, monthly.rpt.prd) - a6val = a6val[seq(12,.N, by=12),] - a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd] - - a6_auc[,score:=-pd] - setkey(a6_auc, score) - - a6_auc[order(score), height := bads/sum(bads)] - a6_auc[order(score), width := tots/sum(tots)] - plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l") - abline(0,1) - a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc) - a6_auc2[,cheight := cumsum(height)] - a6_auc2[,cwidth := cumsum(width)] - 
2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1 -} - -gini(a6val2) -gini(a6dev2) - diff --git a/tutorials/fannie_mae_new/4a_read_appl.r b/tutorials/fannie_mae_new/4a_read_appl.r deleted file mode 100644 index 29483b2f..00000000 --- a/tutorials/fannie_mae_new/4a_read_appl.r +++ /dev/null @@ -1,59 +0,0 @@ -source("tutorials/fannie_mae/0_setup.r") -library(disk.frame) - -fs::dir_delete(file.path(outpath, "acq.df")) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - 
stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae_new/4b_exploratory.R b/tutorials/fannie_mae_new/4b_exploratory.R deleted file mode 100644 index 1cf4382d..00000000 --- a/tutorials/fannie_mae_new/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default 
------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - #filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae_new/4c_explore_xgboost.r b/tutorials/fannie_mae_new/4c_explore_xgboost.r deleted file mode 100644 index f6a53cba..00000000 --- a/tutorials/fannie_mae_new/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 
= disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae_new/4d_xgboost.r b/tutorials/fannie_mae_new/4d_xgboost.r deleted file mode 100644 index 248e676a..00000000 --- a/tutorials/fannie_mae_new/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - 
monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = 
cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae_new/5a_appl_model.r b/tutorials/fannie_mae_new/5a_appl_model.r deleted file mode 100644 index fb088672..00000000 --- a/tutorials/fannie_mae_new/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - -defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) 
%>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae_new/5b_one_var.r b/tutorials/fannie_mae_new/5b_one_var.r deleted file mode 100644 index 8b796e2e..00000000 --- a/tutorials/fannie_mae_new/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = 
xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - 
-prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - 
filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae_new/5c_using_a_fn.r b/tutorials/fannie_mae_new/5c_using_a_fn.r deleted file mode 100644 index e083fa54..00000000 --- a/tutorials/fannie_mae_new/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/5d_AutoML.r b/tutorials/fannie_mae_new/5d_AutoML.r deleted file mode 100644 index 7525c879..00000000 --- a/tutorials/fannie_mae_new/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") 
-library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev = acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae_new/5e_AutoML_actual.r b/tutorials/fannie_mae_new/5e_AutoML_actual.r deleted file mode 100644 index 9e7db972..00000000 --- a/tutorials/fannie_mae_new/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - #browser() - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - #browser() - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - 
)) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R b/tutorials/fannie_mae_new/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a29..00000000 --- 
a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, 
negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/6a_origination_year.r b/tutorials/fannie_mae_new/6a_origination_year.r deleted file mode 100644 index 8115a782..00000000 --- a/tutorials/fannie_mae_new/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae_new/7a_speedglm.r b/tutorials/fannie_mae_new/7a_speedglm.r deleted file mode 100644 index 3c3fe0c6..00000000 --- 
a/tutorials/fannie_mae_new/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - -library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! 
make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae_new/8a_keras.r b/tutorials/fannie_mae_new/8a_keras.r deleted file mode 100644 index 1fc7dcda..00000000 --- a/tutorials/fannie_mae_new/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si 
= sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/flights-100-1000.r b/tutorials/flights-100-1000.r deleted file mode 100644 index 
da2f528f..00000000 --- a/tutorials/flights-100-1000.r +++ /dev/null @@ -1,49 +0,0 @@ -library(data.table) -library(disk.frame) -setup_disk.frame() - -bench_disk.frame_data.table_group_by <- function(data1,n) { - a.sharded.df = as.disk.frame(data1, shardby = c("year", "month", "day")) - - a.not_sharded.df = as.disk.frame(data1) - - a = copy(data1) - setDT(a) - - data.table_timing = system.time(a[,.(mean_dep_time = mean(dep_time, na.rm=T)), .(year, month, day)])[3] - - disk.frame_sharded_timing = system.time( - a.sharded.df[ - , - .(mean_dep_time = mean(dep_time, na.rm=TRUE)), - .(year, month, day), - keep = c("year", "month","day", "dep_time")])[3] - - - disk.frame_not_sharded_timing = system.time( - a.not_sharded.df[ - , - .( - sum_dep_time = sum(dep_time, na.rm=TRUE), - n = sum(!is.na(dep_time)) - ), - .(year, month, day), - keep = c("year", "month","day", "dep_time")][ - , - .(mean_dep_time = sum(sum_dep_time)/sum(n)), - .(year, month, day) - ])[3] - - barplot( - c(data.table_timing, disk.frame_sharded_timing, disk.frame_not_sharded_timing), - names.arg = c("data.table", "sharded disk.frame", "not sharded disk.frame"), - main = glue:glue("flights duplicated {n} times group-by year, month, day"), - ylab = "Seconds") -} - -system.time(flights_100 <- rbindlist(lapply(1:100, function(x) nycflights13::flights))) - -system.time(flights_1000 <- rbindlist(lapply(1:10, function(x) flights_100))) - -bench_disk.frame_data.table_group_by(flights_100, 100) -bench_disk.frame_data.table_group_by(flights_1000, 1000) diff --git a/tutorials/flights_case_study/0-download-data.sh b/tutorials/flights_case_study/0-download-data.sh deleted file mode 100644 index ba7b1350..00000000 --- a/tutorials/flights_case_study/0-download-data.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -cd inst/flights_case_study/data/ - -for i in {1987..2008}; do wget -q http://stat-computing.org/dataexpo/2009/"$i".csv.bz2; done - -for i in {1987..2008}; do bunzip2 "$i".csv.bz2; done diff --git 
a/tutorials/flights_case_study/1-make-disk-frame.r b/tutorials/flights_case_study/1-make-disk-frame.r deleted file mode 100644 index 19e182b0..00000000 --- a/tutorials/flights_case_study/1-make-disk-frame.r +++ /dev/null @@ -1,39 +0,0 @@ - library(disk.frame) - - pt = proc.time() - non_shard = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) - - - sharded = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) - - - - nrow(non_shard) - nrow(sharded) - - library(disk.frame) - sharded = disk.frame("inst/data/flights_all_shard.df") - a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - - a = a[,sum(N), UniqueCarrier] - - nosharded = disk.frame("inst/data/flights_all_no_shard.df/") - a1 = nosharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - a1 = a1[,sum(N), UniqueCarrier] - - setkey(a, UniqueCarrier) - setkey(a1, UniqueCarrier) - - identical(a, a1) - diff --git a/tutorials/flights_case_study/2-shard-check.r b/tutorials/flights_case_study/2-shard-check.r deleted file mode 100644 index 5ae36ddc..00000000 --- a/tutorials/flights_case_study/2-shard-check.r +++ /dev/null @@ -1,46 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) - - -b = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = 
T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) - - -aa = a[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -bb = b[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -identical(aa,bb) - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - order_by(UniqueCarrier) - -b2 = b %>% group_by(UniqueCarrier) %>% - summarise(n=count()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) - diff --git a/tutorials/flights_case_study/2-shard-check2.r b/tutorials/flights_case_study/2-shard-check2.r deleted file mode 100644 index b4b97162..00000000 --- a/tutorials/flights_case_study/2-shard-check2.r +++ /dev/null @@ -1,31 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = disk.frame("flights_all_sharded.df") -b = disk.frame("flights_all_notsharded.df") - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -b2 = b %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -identical(a2, b2) - -fa = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) - -ab = csv_to_disk.frame(fa, outdir ="pls_del_tmp", overwrite = T) diff --git a/tutorials/flights_case_study/testing.r b/tutorials/flights_case_study/testing.r deleted file mode 100644 index b47cdee6..00000000 --- a/tutorials/flights_case_study/testing.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(furrr) -library(magrittr) -plan(multiprocess) - -# 
fs::dir_delete("tmp22007") -# fs::dir_delete("tmp22008") -# fs::dir_delete("tmp32007") -# fs::dir_delete("tmp32008") -# fs::dir_delete("tmp2007") -# fs::dir_delete("tmp2008") - -pt = proc.time() -full = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp3",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -cnames = names(full) - -pt = proc.time() -non_shard = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp",.x), - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -pt = proc.time() -sharded = 1987:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - col.names = cnames, - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -nrow(full) -nrow(non_shard) -nrow(sharded) - -library(disk.frame) -# sharded = disk.frame("inst/data/flights_all_shard.df") -a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - -a = a[,sum(N), UniqueCarrier] - -# non_shard = disk.frame("inst/data/flights_all_no_shard.df/") -a1 = non_shard[,.N, UniqueCarrier, keep="UniqueCarrier"] -a1 = a1[,sum(N), UniqueCarrier] - -setkey(a, UniqueCarrier) -setkey(a1, UniqueCarrier) - -identical(a, a1) - -a2 = merge(a,a1, by="UniqueCarrier") - -a2[,all(V1.x == V1.y)] diff --git a/tutorials/flights_case_study/testing2.r b/tutorials/flights_case_study/testing2.r deleted file mode 100644 index d12f0aeb..00000000 --- a/tutorials/flights_case_study/testing2.r +++ /dev/null @@ -1,31 +0,0 @@ -#install.packages("tidyverse") -library(readr) -library(disk.frame) -library(data.table) 
-library(future) -library(magrittr) -plan(multiprocess) - -pt=proc.time() -a = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) -print(timetaken(pt)) - -pt=proc.time() -b = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/nyc-taxi/nyc-taxi.Rmd b/tutorials/nyc-taxi/nyc-taxi.Rmd deleted file mode 100644 index 658fcc8d..00000000 --- a/tutorials/nyc-taxi/nyc-taxi.Rmd +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "NYC Taxi" -author: "ZJ" -date: "8/12/2019" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - - -```{r} -library(disk.frame) -setup_disk.frame() -options(future.globals.maxSize = Inf) -``` - -## Loading the data -```{r} -system.time(nyc.df <- csv_to_disk.frame(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE))) -``` - -```{r} -sapply(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE), function(x) { - ncol(data.table::fread(x, nrows = 2)) -}) -``` - -```{r} -nyc.df = disk.frame("nyc.df") -a = nyc.df[,uniqueN(Dispatching_base_num), keep=names(nyc.df)[1]] -a -``` - - -```{r} -nyc.df = disk.frame("nyc.df") -head(nyc.df) -``` - - -```{r} -library(magrittr) -library(lubridate) -system.time(nyc2.df <- nyc.df %>% - cmap(function(chunk) { - chunk[!is.na(Dispatching_base_number), Dispatching_base_num := 
Dispatching_base_number] - chunk[, Dispatching_base_number:=NULL] - chunk[, Pickup_DateTime := lubridate::as_datetime(Pickup_DateTime)] - chunk[!is.na(Pickup_date), Pickup_DateTime := lubridate::as_datetime(Pickup_date)] - chunk[,Pickup_date:=NULL] - chunk - }, outdir="nyc2.df", lazy = FALSE, overwrite = TRUE)) -``` - -```{r} -nyc2.df = disk.frame("nyc2.df") -head(nyc2.df) -``` - - -```{r} -system.time(shard(nyc2.df, shardby = "Dispatching_base_num", outdir="nyc.sharded.df")) -``` - -```{r} - -``` \ No newline at end of file diff --git a/tutorials/readme.r b/tutorials/readme.r deleted file mode 100644 index bc7d241e..00000000 --- a/tutorials/readme.r +++ /dev/null @@ -1,94 +0,0 @@ -#install.packages(c("fst","future","data.table")) -if(!require(devtools)) install.packages("devtools") -if(!require(disk.frame)) devtools::install_github("xiaodaigh/disk.frame") - -library(dplyr) -library(disk.frame) -library(data.table) -library(fst) -#library(future) -#library(future.apply) -#library(data.table) -nworkers = parallel::detectCores(logical=F) -cat(nworkers," workers\n") -#plan(multiprocess, workers = nworkers, gc = T) -#options(future.globals.maxSize=Inf) - -rows_per_chunk = 1e7 -# generate synthetic data -tmpdir = "tmpfst" -fs::dir_delete(tmpdir) - -# write out 2*nworkers chunks -pt = proc.time() -df = disk.frame(tmpdir) -purrr::walk(1:(nworkers*2), function(ii) { - system.time(ab <- data.table(a = runif(rows_per_chunk), b = runif(rows_per_chunk)) ) #102 seconds - add_chunk(df, ab, ii) -}) -cat("Generating data took: ", timetaken(pt), "\n") - - -# read and output the disk.frame as it to assess "sequential" read-write performance -pt = proc.time() -df2 <- map.disk.frame(df, ~.x, outdir = "tmpfst2", lazy = F, overwrite = T) -cat("Read and write took: ", timetaken(pt), "\n") - -# get first few rows -head(df) - -# get last few rows -tail(df) - -# number of rows -nrow(df) - -# number of columns -ncol(df) - - -# dplyr verbs 
------------------------------------------------------------- -df = disk.frame(tmpdir) - -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) - -# need a 2nd stage to finalise summing -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) %>% - summarise(suma = sum(suma)) - -# filter -pt = proc.time() -system.time(df_filtered <- df %>% - filter(a < 0.1)) -cat("filtering a < 0.1 took: ", timetaken(pt), "\n") -nrow(df_filtered) - -# group by -pt = proc.time() -res1 <- df %>% - filter(b < 0.1) %>% - mutate(blt005 = b < 0.05) %>% - group_by(blt005, hard = T) %>% # hard group_by is slower but avoid a 2nd stage aggregation - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("group by took: ", timetaken(pt), "\n") - - -# keep only one var is faster -pt = proc.time() -res1 <- df %>% - srckeep("a") %>% #keeping only the column `a` from the input - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise keeping only one column ", timetaken(pt), "\n") - -# same operation without keeping -pt = proc.time() -res1 <- df %>% - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise without keeping", timetaken(pt), "\n") diff --git a/tutorials/surf_2019_02_demo/.gitignore b/tutorials/surf_2019_02_demo/.gitignore deleted file mode 100644 index bef9413c..00000000 --- a/tutorials/surf_2019_02_demo/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.ipynb_checkpoints -surf_201902.nb.html -surf_201902.html -01_surf_201902.html -01_surf_201902.tex diff --git a/tutorials/surf_2019_02_demo/00_setup.r b/tutorials/surf_2019_02_demo/00_setup.r deleted file mode 100644 index 5cdf0dec..00000000 --- a/tutorials/surf_2019_02_demo/00_setup.r +++ /dev/null @@ -1,41 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -raw_perf_data_path = 
"C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All_small/" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/surf_2019_02_demo/01_surf_201902.rmd b/tutorials/surf_2019_02_demo/01_surf_201902.rmd deleted file mode 100644 index 
418ef86f..00000000 --- a/tutorials/surf_2019_02_demo/01_surf_201902.rmd +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -# 2_exploratory.r -#source("../fannie_mae/00_setup.r") -source("00_setup.r") -``` - -# Assign a `disk.frame` -You use `disk.frame(path)` to assign a disk.frame - -```{r} -dir(file.path(outpath, "fm_with_harp")) %>% - head -``` - -Use the function `disk.frame` to assign a folder -```{r} -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp -``` - -## Simple info about disk.frame -```{r} -nrow(fm_with_harp) - -ncol(fm_with_harp) -``` - - -```{r} -head(fm_with_harp) -``` - -```{r} -tail(fm_with_harp) -``` - -## Simple group-by example - -I want create the `sum(default_12m)` group by month. The sql statement looks like - -```sql -select - monthly.rpt.prd, - sum(default_12m) as n_defaults, - count(*) as cnt, - n_default/cnt as odr # observed default rate -from - table -group by - monthly.rpt.prd; -``` -This analysis only uses two columns, namely `default_12m` and `monthly.rpt.prd`. So I use `srckeep` to ensure that only those two columns are loaded. - -The backend for disk.frame is fst which allows **random access** to columns AKA load only what you need! This is importance for performance! -```{r} -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T))) -``` -The error recognises that the `shardkey` is `loan_id` which is not the group-by variable - `monthly.rpt.prd`. - -The operations returned almost instantly, this is because the computation was done lazily. For now operations are recorded. No computation will take place until `collect` or `compute` is called. - -Now **play** the recording by calling `collect`. This will take a minute. 
Back to the slides: - -```{r} -system.time(a_wh2 <- a_wh1 %>% collect) # 60~70 plugged in - -a_wh2 -``` - -```{r} -# temporary -# a_wh1 %>% -# srckeep(c("monthly.rpt.prd", "default_12m")) %>% -# map(function(chunk) { -# chunk[1,] -# }) %>% -# collect -``` - - -```{r} -# temporary -# a_wh1 %>% -# srckeep(c("monthly.rpt.prd", "default_12m")) %>% -# map(~{ -# .x[1,.SD] -# }) %>% - # collect -``` - - -once `collect` is called the resultant data is stored as a data.frame. - -However this is not the correct result, as the group by was performed within each chunk. Hence we need a second stage group by. The second group by takes no time at all, as everything was done in memory - -```{r} -system.time(a_wh3 <- a_wh2 %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - -Manipulate the data and plot -```{r} -a_wh4 = a_wh3 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r deleted file mode 100644 index 81c1a330..00000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r +++ /dev/null @@ -1,6 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd deleted file mode 100644 index 4a6f760d..00000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -source("../fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) 
- -fm_with_harp -``` - - -```{r} -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - - -```{r} -a_wh4 = a_wh1 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` - -$$ -600 = a + b\times\log(\frac{1-p}{p}) = a + b\times\log(30)\\ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ - -$$ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index a87a8945..00000000 --- a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,112 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which 
has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -aucs = map_dbl(res_all, ~.x$auc) -plot( - aucs, - main = "Fannie Mae Application Scorecard AUC", - ylab="AUC", - xlab = "Number of features") - -if(F) { - library(gganimate) - auc_dt = data.table(num_of_vars = 1:length(aucs), auc = aucs) - auc_dt %>% - ggplot(aes(num_of_vars, auc)) + - geom_point() + - transition_states( - num_of_vars, - transition_length = 2, - state_length = 1 - ) + - enter_fade() + - exit_shrink() + - ease_aes('sine-in-out') -} - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - 
"default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R b/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R deleted file mode 100644 index 675a81c2..00000000 --- a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,118 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) - -auc = disk.frame:::auc -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = 
.x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df, line = F) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - if(line == F) { - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) - } else { - lines(cts[seq(1,.N, length.out=100),], lty = 3, col = "blue") - legend("topleft", c("val AUC", "random", "dev AUC"), lty=c(1,2,3), col=c("black","blue","black")) - } -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev, line = T)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -#View(scorecard) - -scorecard[, - low := c(-Inf, feature_lbl[-.N]), variable] -setnames(scorecard, "feature_lbl", "high") - -scorecard=scorecard[,.(variable, low, high, score)] - -DT::datatable(scorecard, options = list(pageLength=20)) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/surf_2019_02_demo/11_speedglm.R b/tutorials/surf_2019_02_demo/11_speedglm.R deleted file 
mode 100644 index 3e180cb9..00000000 --- a/tutorials/surf_2019_02_demo/11_speedglm.R +++ /dev/null @@ -1,51 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -# not gonna work -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/12_keras.r b/tutorials/surf_2019_02_demo/12_keras.r deleted file mode 100644 index e038fb09..00000000 --- a/tutorials/surf_2019_02_demo/12_keras.r +++ /dev/null @@ -1,180 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - 
-acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -# compile a kera model -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -# compiling the model ~ -system.time(model <- build_model()) - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) 
- - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/13_logistic_regression.r b/tutorials/surf_2019_02_demo/13_logistic_regression.r deleted file mode 100644 index db822586..00000000 --- a/tutorials/surf_2019_02_demo/13_logistic_regression.r +++ /dev/null @@ -1,201 +0,0 @@ - -two_var = acqall_dev[,.(cscore_b, ocltv, mi_pct, default_next_12m)] - -all_vars = collect(acqall_dev) - -non_char = - names(all_vars)[sapply(all_vars, typeof) != c("character")] %>% - setdiff("first_default_date") - -# -# all_vars = tibble(all_vars) -# library(dplyr) -# cor( -# all_vars -# , use = "pairwise.complete.obs") - -a = sort(non_char) -alvarsn = all_vars[, ..a] - -cdt = cor(alvarsn, use = "pairwise.complete.obs") - -cdt2 = cdt[,which(colnames(cdt) == "default_next_12m")] - -cdt3 = data.table(names = names(cdt2), cor = cdt2, abs_cor = abs(cdt2)) - 
-cdt3 = cdt3[order(abs_cor, decreasing = T)] - -DT::datatable(cdt3) - - -corrplot::corrplot(cdt, method="color", diag=F) - -# Binning - -two_var1 = two_var[!is.na(ocltv),][order(ocltv),.(ocltv, default_next_12m)] - - -two_var2 = two_var1[, .(ndef = sum(default_next_12m), .N), ocltv] - -two_var3 = two_var2[,.(ocltv, ndef, N, GB_odds = cumsum(N)/cumsum(ndef))] - - -wm = which(two_var3$GB_odds == max(Filter(is.finite, two_var3$GB_odds))) -wm3 = wm -two_var3[,.(ocltv, GB_odds)] %>% - plot(main = "Cumulative Good/Bad Odds", - xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - ylim=c(0,600)) - -abline(v = two_var3[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) - ) - -## plot -two_var4 = two_var3[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var3[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm]), GB_odds[wm])]) - - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -abline(v = two_var4[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -## plot 2 -two_var5 = two_var4[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var4[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm3]), GB_odds[wm3])]) - -lines(two_var4[, .(c(two_var3$ocltv[wm3],two_var4$ocltv[wm]), GB_odds[wm])]) - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -abline(v=51, lty=2) - -var1 
= res_all[[1]] - -all_vars[, cscore_bin := cut( - all_vars[,var1$feature, with = F][[1]] , - var1$bin[[2]] %>% setdiff(c(701,713))) %>% addNA] - -bp1 = all_vars[order(cscore_bin),.(sum(default_next_12m)/.N), cscore_bin] -barplot( - bp1$V1, - names.arg = bp1[[1]], - main = "cscore_b bin Default Rate" -) - -setnames(bp1,"V1","def_rate") - -overall_df = all_vars[,sum(default_next_12m)/.N] - -bp1[,whole := overall_df] - -bp1[,woe := log((1-def_rate)/def_rate) - log((1-whole)/whole)] - -DT::datatable(bp1) - -glm(def_rate ~ I(-woe), data = bp1, family = binomial) - -all_vars[,csr_woe := bp1$woe[cscore_bin]] - -DT::datatable(all_vars[1:10,.(default_next_12m, cscore_bin, csr_woe)]) - -glm( - default_next_12m ~ csr_woe, - family = binomial, - data = all_vars -) - -View(two_var) - - - - - - -var2 = res_all[[3]] -all_vars[, ocltv_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -bp2 = all_vars[order(ocltv_bin),.(sum(default_next_12m)/.N), ocltv_bin] -barplot(bp2$V1, names.arg = bp2[[1]], main = "ocltv Orig LTV Default Rate") - - -var2 = res_all[[9]] -var2$feature -all_vars[, dti_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -dtrain <- xgboost::xgb.DMatrix( - label = two_var[,default_next_12m], - data = as.matrix(two_var[,cscore_b])) - -m2 <- xgboost::xgboost( - data=dtrain, - monotone_constraints = -1, # awesome!!! 
- nrounds = 1, # one tree ONLY - eta = 1, # learning rate - - - - objective = "binary:logitraw", - tree_method="exact", - max_depth = 2, - base_score = two_var[,sum(default_next_12m)/.N] -) - -xgboost::xgb.plot.tree(model=m2) - -plot(m2) - -bp2 = all_vars[order(dti_bin),.(sum(default_next_12m)/.N), dti_bin] -barplot(bp2$V1, names.arg=bp2[[1]], main="dti Debt-to-income Ratio Default Rate") - - -dt <- xgb.model.dt.tree(model = m2) - - -bin_eg = data.table( - Low = c(-Inf, 597, 657, 716), - High = c(597, 657, 716, Inf) -) - -View(bin_eg) - - -m = glm( - default_next_12m ~ - cscore_bin + - ocltv_bin + - dti_bin, - family = binomial, - data=all_vars) - -DT::datatable(broom::tidy(m), options=list(pageLength=16)) diff --git a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb b/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb deleted file mode 100644 index 93e1c58b..00000000 --- a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb +++ /dev/null @@ -1,68 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2_exploratory.r\n", - "source(\"inst/fannie_mae/0_setup.r\")\n", - "\n", - "fm_with_harp = disk.frame(file.path(outpath, \"fm_with_harp\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "head(fm_with_harp)\n", - "\n", - "nrow(fm_with_harp)\n", - "\n", - "# need a two stage summary\n", - "system.time(a_wh1 <- fm_with_harp %>% \n", - " srckeep(c(\"default_12m\",\"monthly.rpt.prd\")) %>% \n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " N = n(), \n", - " n_defaults = sum(default_12m, na.rm = T)) %>% \n", - " collect(parallel = T) %>%\n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " odr = sum(n_defaults)/sum(N)\n", - " ) %>% \n", - " rename(\n", - " Date = monthly.rpt.prd,\n", - " `Observed Default Rate%` = odr\n", - " ))\n", - " \n", - "\n", - "a_wh2 = a_wh1 %>% gather(key = type, 
value=rate, -Date)\n", - "\n", - "ggplot(a_wh2) + \n", - " geom_line(aes(x=Date, y = rate, colour = type)) +\n", - " ggtitle(\"Fannie Mae Observed Default Rate over time & HARP Conversion Rate\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.5.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/tutorial.r b/tutorials/tutorial.r deleted file mode 100644 index c3e459ff..00000000 --- a/tutorials/tutorial.r +++ /dev/null @@ -1,124 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -a = disk.frame::disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - - -nrow(a) -ncol(a) - -head(a) - -system.time(asample <- a %>% - srckeep("loan_id") %>% - filter(!duplicated(loan_id)) %>% - sample_frac(0.01) %>% - collect) - -asample = data.frame(loan_id = asample) - -system.time( - a1 <-a %>% - cmap(~{ - inner_join(.x, asample, by = "loan_id") - }, - lazy = FALSE, - outdir = "c:/data/fannie_mae_disk_frame/fm1pct.df/", - overwrite = TRUE)) - - -a2 = collect(a1) - -data.table::fwrite(a2, "a2.csv") -arrow::write_parquet(a2, "a2.parquet") -fst::write_fst(a2, "a2.fst") - -a2$date = as.Date(a2$monthly.rpt.prd, "%m/%d/%Y") -a2$delq.status = as.numeric(a2$delq.status) - -a2[,.N,delq.status] -a2[is.na(delq.status),delq.status := 0] - -head(a2) - -setkey(a2, loan_id, date) - -system.time( - a3 <- - setDT(a2)[order(date), apply(.SD[,shift(delq.status, (1:24), type="lead")], 1 , function(x) max(x, na.rm=TRUE)), loan_id] -) - -setnames(a3, names(a3), c("loan_id", "default_n24m")) -setDT(a3)[is.infinite(default_n24m), default_n24m := 0] -a3[,.N, default_n24m] - -a2$default_n24m = a3$default_n24m >= 4 - -nrow(a2) - -a2[,.N, default_n24m] - -arrow::write_parquet(a2, "a2.parquet") -arrow::write_parquet(a2, "data.parquet") -fst::write_fst(a2, "a2.fst") 
-data.table::fwrite(a2, "a2.csv") - -table(sapply(a2, mode)) - -a2 = a2 %>% - janitor::clean_names() - -str(a2$date) -table(sapply(a2, class)) - -map2(a2, names(a2), ~{ - cx = class(.x) - list( - name = .y, - type = - ifelse( - cx == "character", - "text", - ifelse( - cx == "Date", - "datetime", - ifelse( - cx == "logical", - "bool", - ifelse( - cx == "numeric", - "numeric", - "error" - ) - ) - )), - semantic_type = - ifelse( - cx == "character", - "TextColumn", - ifelse( - cx == "Date", - "DateTimeColumn", - ifelse( - cx == "logical", - "BooleanColumn", - ifelse( - cx == "numeric", - "NumericColumn", - "error" - ) - ) - )) - ) %>% jsonlite::toJSON(auto_unbox = TRUE) -}) %>% unlist %>% - paste0(collapse = ", ") -> outjson - -res = glue::glue('{"column": [|outjson|]}', .open = "|", .close = "|") - -write(res, "schema.json") - - - - - - diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore b/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore deleted file mode 100644 index 2960c12d..00000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore +++ /dev/null @@ -1 +0,0 @@ -useR!-2020-Tutorial-part-1.html diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd b/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd deleted file mode 100644 index 918ac490..00000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd +++ /dev/null @@ -1,147 +0,0 @@ ---- -title: "useR! 2020 Tutorial Part 1" -output: learnr::tutorial -runtime: shiny_prerendered ---- - -```{r setup, include=FALSE} -library(learnr) -knitr::opts_chunk$set(echo = FALSE) -``` - -## Intro - -### What is disk.frame for? 
- -### Installation - -```{r installation, exercise = TRUE} -# to install CRAN version -install.packages("disk.frame") - -# Uncomment below to install latest development version -# install.packages("remotes") -# remotes::install_github("xiaodaigh/disk.frame") -``` - -On some platforms, it helps to specify the repo if you encounter an -```r -install.packages("disk.frame", repo="https://cran.rstudio.com") -``` -### Download data? - -### Generate data? - - -## Starting disk.frame - -### Exercise - -{disk.frame} can use multiple cores and may transfer data between **workers**. - -So there are some setting up code that is recommended for all users. - -```{r setting_up, exercise=TRUE} -library(disk.frame) - -``` - -You should see a message like this from the console -``` -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - - -```{r setting_up1, exercise=TRUE} -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -If you forget - -```{r setting_up2, exercise=TRUE} -disk.frame::show_boilerplate() - -# or equivalently -disk.frame::show_ceremony() -``` - -TODO insert GIF of inserting code into RStudio. - -## Ingesting Data - -To use {disk.frame} as your data processing engine, you need to store your data in the disk.frame format. - -### The `disk.frame` format - -The {disk.frame} format is a folder containing multiple [`fst`](https://www.fstpackage.org/) files. And the files are named `1.fst`, `2.fst` etc, where each `fst` file is called a **chunk**. - -TODO add quiz - -```{r ingest_data_frame, exercise=TRUE} -mtcars.df = as.disk.frame(mtcars) -mtcars.df -``` - -This works but if your data fits into RAM, you probably don't need {disk.frame}. 
But perhaps you data.frame fits into RAM but you can't merge it with another dataset, because that would require additional RAM. So storing it as disk.frame before merging can help. - -## Ingesting CSV - -```{r print-limit, exercise=TRUE, exercise.eval=TRUE} -data.table::fwrite(mtcars, "mtcars.csv") -# alternatively -#write.csv(mtcars, "mtcars.csv") - -mtcars.df = csv_to_disk.frame("mtcars.csv") -mtcars.df -``` - -Notice how `mtcars.df` are stored in a temporary folder? You can move it using - -```{r} -move_to(mmtcars.df, "/some/path/permanent") -``` - -or - -```{r} -mtcars.df = csv_to_disk.frame( - "mtcars.csv", - outdir = "/some/path/permanent") - -lib1 = libname("some_folder") - -lib1$mtcars2.df = lib1$mtcars.df %>% filter(mpg < 0.5) -``` - -TODO srckeep("") - - -### Quiz - -*You can include any number of single or multiple choice questions as a quiz. Use the `question` function to define a question and the `quiz` function for grouping multiple questions together.* - -Some questions to verify that you understand the purposes of various base and recommended R packages: - -```{r quiz} -quiz( - question("Which package contains functions for installing other R packages?", - answer("base"), - answer("tools"), - answer("utils", correct = TRUE), - answer("codetools") - ), - question("Which of the R packages listed below are used to create plots?", - answer("lattice", correct = TRUE), - answer("tools"), - answer("stats"), - answer("grid", correct = TRUE) - ) -) -``` - diff --git a/tutorials/vs-vaex/ok.py b/tutorials/vs-vaex/ok.py deleted file mode 100644 index 64effa8b..00000000 --- a/tutorials/vs-vaex/ok.py +++ /dev/null @@ -1,5 +0,0 @@ -import vaex - -df = vaex.example() -df # Since this is the last statement in a cell, it will print the DataFrame in a nice HTML format. 
- diff --git a/utils/build_utils.R b/utils/build_utils.R index 33103cf0..4e955594 100644 --- a/utils/build_utils.R +++ b/utils/build_utils.R @@ -1,21 +1,3 @@ -# df_bookdown_build <- function() { -# rmds = list.files("vignettes", pattern = "*.Rmd") -# sapply(rmds, function(file) { -# fs::file_copy( -# file.path("book", file), -# file.path("vignettes", file), -# overwrite = TRUE -# ) -# }) -# if(fs::dir_exists("book/_bookdown_files")) { -# fs::dir_delete("book/_bookdown_files") -# } -# while(fs::dir_exists("book/_bookdown_files")) { -# Sys.sleep(1) -# } -# rmarkdown::render_site("book", encoding = 'UTF-8') -# } - df_build_site <- function() { df_build_readme() df_setup_vignette() @@ -205,6 +187,12 @@ df_release <- function() { devtools::release() } +df_ultimate <- function() { + df_check() + df_release() + df_build_site() +} + if(F) { df_check() }