diff --git a/DESCRIPTION b/DESCRIPTION index 1301c323..51eedc85 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Type: Package Package: disk.frame Title: Larger-than-RAM Disk-Based Data Manipulation Framework -Version: 0.6.0 -Date: 2022-01-31 +Version: 0.7.2 +Date: 2022-03-07 Authors@R: c( person("Dai", "ZJ", email = "zhuojia.dai@gmail.com", role = c("aut", "cre")), person("Jacky", "Poon", role = c("ctb")) @@ -30,9 +30,11 @@ Imports: bit64, benchmarkme, purrr (>= 0.3.2), - rlang + globals, + rlang, + arrow Depends: - R (>= 3.4), + R (>= 4.0), dplyr (>= 1.0.0) Suggests: nycflights13, @@ -51,4 +53,4 @@ LinkingTo: RoxygenNote: 7.1.2 Encoding: UTF-8 URL: https://diskframe.com -BugReports: https://github.com/xiaodaigh/disk.frame/issues +BugReports: https://github.com/DiskFrame/disk.frame/issues diff --git a/NAMESPACE b/NAMESPACE index 2def4912..03a29bef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -82,6 +82,7 @@ export(delete) export(df_ram_size) export(dfglm) export(disk.frame) +export(disk.frame_to_parquet) export(distribute) export(evalparseglue) export(foverlaps.disk.frame) @@ -113,6 +114,7 @@ export(nchunks) export(ncol) export(nrow) export(overwrite_check) +export(partition_filter) export(quantile_df.chunk_agg.disk.frame) export(quantile_df.collected_agg.disk.frame) export(rbindlist.disk.frame) @@ -138,6 +140,7 @@ import(fs) import(fst) import(stringr) importFrom(Rcpp,evalCpp) +importFrom(arrow,write_parquet) importFrom(benchmarkme,get_ram) importFrom(bigreadr,get_split_files) importFrom(bigreadr,split_file) @@ -165,6 +168,7 @@ importFrom(dplyr,full_join) importFrom(dplyr,glimpse) importFrom(dplyr,group_by) importFrom(dplyr,group_by_drop_default) +importFrom(dplyr,group_map) importFrom(dplyr,group_vars) importFrom(dplyr,groups) importFrom(dplyr,inner_join) @@ -195,6 +199,7 @@ importFrom(future,nbrOfWorkers) importFrom(future,plan) importFrom(future,sequential) importFrom(future.apply,future_lapply) +importFrom(globals,findGlobals) importFrom(glue,glue) 
importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) @@ -215,5 +220,6 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,memory.limit) importFrom(utils,tail) +importFrom(utils,type.convert) importFrom(utils,unzip) useDynLib(disk.frame) diff --git a/NEWS.md b/NEWS.md index e075c6da..501a74ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# disk.frame 0.7.2 +* Bugfixes Github 384 + +# disk.frame 0.7.1 +* Bugfixes +* Add ability to convert to Parquet + +# disk.frame 0.7 +* Partitioned by folder +* Updated R version to 4 + +# disk.frame 0.6.1 +* Fixed bug with data.table syntax +* Auto detection of srckeep in group by +* Global detection for group by and summarise + # disk.frame 0.6 * Much better NSE support in disk.frame! * removed `hard_arrange` and `hard_group_by` diff --git a/R/chunk_mapper.r b/R/chunk_mapper.r index 1b0d3ff7..dc6b0556 100644 --- a/R/chunk_mapper.r +++ b/R/chunk_mapper.r @@ -41,11 +41,36 @@ create_chunk_mapper <- function(chunk_fn, warning_msg = NULL, as.data.frame = FA if(!is.null(warning_msg)) { warning(warning_msg) } + dotdotdot = rlang::enexprs(...) 
+ + # convert any quosure to labels + for (i in seq_along(dotdotdot)) { + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() + } + + args_str = mapply(function(name, val) { + rhs = deparse(val) %>% + paste0(collapse = "") + if(name != "") { + sprintf("%s=%s", name, rhs) + } else { + rhs + } + }, names(dotdotdot), dotdotdot) %>% + paste0(collapse = ", ") + + if (args_str == "") { + code = parse(text="chunk_fn(.disk.frame.chunk)")[[1]] + } else { + code = parse(text=sprintf("chunk_fn(.disk.frame.chunk, %s)", args_str))[[1]] + } + # you need to use list otherwise the names will be gone - code = substitute(chunk_fn(.disk.frame.chunk, ...)) + # code = substitute(chunk_fn(.disk.frame.chunk, ...)) - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { + if (paste0(deparse(code), collapse="") == "chunk_fn(.disk.frame.chunk, NULL)") { globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) } else { globals_and_pkgs = future::getGlobalsAndPackages(code) diff --git a/R/collect.r b/R/collect.r index cf5add15..98347581 100644 --- a/R/collect.r +++ b/R/collect.r @@ -25,14 +25,61 @@ #' @rdname collect collect.disk.frame <- function(x, ..., parallel = !is.null(attr(x,"recordings"))) { cids = get_chunk_ids(x, full.names = TRUE, strip_extension = FALSE) + # obtain filters from structure + partitioned_paths_or_not = get_partition_paths(x) + + if (partitioned_paths_or_not$is_partitioned) { + partitioned_paths = partitioned_paths_or_not$paths + if(length(partitioned_paths) >= 1) { + # filter the cids based on the paths + tmp = data.frame(paths = cids) %>% + mutate(dirname = dirname(paths)) %>% + inner_join(data.frame(dirname = sapply(partitioned_paths, tools::file_path_as_absolute)), by = "dirname") + cids = tmp$paths + } + } + if(nchunks(x) > 0) { if(parallel) { - tmp = future.apply::future_lapply(cids, function(.x) { - get_chunk(x, .x, full.names = TRUE) - }, future.seed = TRUE) + tmp<-future.apply::future_lapply(cids, function(.x, meh) { + 
if(partitioned_paths_or_not$is_partitioned) { + dirpath = dirname(.x) + tmp2 = partitioned_paths_or_not$df %>% + mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path)) + + tmp2a = tmp2 %>% + filter(fullpath == dirpath) %>% + mutate(.check=1) + + stopifnot(nrow(tmp2a) == 1) + + tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a) + return(tmp3) + } else { + return(get_chunk.disk.frame(x, .x, full.names = TRUE)) + } + }, future.seed = NULL) return(rbindlist(tmp)) } else { - purrr::map_dfr(cids, ~get_chunk(x, .x, full.names = TRUE)) + tmp<-lapply(cids, function(.x, meh) { + if(partitioned_paths_or_not$is_partitioned) { + dirpath = dirname(.x) + tmp2 = partitioned_paths_or_not$df %>% + mutate(fullpath = file.path(attr(x, "path") %>% tools::file_path_as_absolute(), .disk.frame.sub.path)) + + tmp2a = tmp2 %>% + filter(fullpath == dirpath) %>% + mutate(.check=1) + + stopifnot(nrow(tmp2a) == 1) + + tmp3 = get_chunk.disk.frame(x, .x, full.names = TRUE, partitioned_info = tmp2a) + return(tmp3) + } else { + return(get_chunk.disk.frame(x, .x, full.names = TRUE)) + } + }) + return(rbindlist(tmp)) } } else { data.table() diff --git a/R/collect.summarized_disk.frame.r b/R/collect.summarized_disk.frame.r index edd686a4..58c7a500 100644 --- a/R/collect.summarized_disk.frame.r +++ b/R/collect.summarized_disk.frame.r @@ -13,6 +13,7 @@ #' @importFrom data.table data.table as.data.table #' @importFrom purrr map_dfr #' @importFrom dplyr collect select mutate +#' @importFrom globals findGlobals #' @return collect return a data.frame/data.table #' @examples #' cars.df = as.disk.frame(cars) @@ -26,7 +27,36 @@ collect.summarized_disk.frame <- function(x, ..., parallel = !is.null(attr(x, "recordings"))) { dotdotdot <- attr(x, 'summarize_code') + group_by_vars = attr(x, "group_by_cols") + + # look at the group by and summaries codes and figure out which columns need to be + # srckeep + df_to_find_cols = 
fst::read_fst(get_chunk_ids(x, full.names = TRUE)[1], from=1, to=1) + + cols_in_summ = lapply(dotdotdot, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + cols_in_group_by = lapply(group_by_vars, function(one) { + globals::findGlobals(one, envir = list2env(df_to_find_cols, parent=globalenv())) + }) %>% unlist %>% unique + + cols_used = c(cols_in_summ, cols_in_group_by) %>% unique + src_keep_cols = intersect(names(df_to_find_cols), cols_used) + + # are there any variables used in the group by or summarise that is not present in the original data? + # if yes then that indicates this could be more complicated e.g. a new var was created with mutate + extra_vars = setdiff(cols_used, names(df_to_find_cols)) + if(length(extra_vars) > 0) { + warning(sprintf( + "These columns that appear in the group-by and summarise does not appear in the original data set: %s. This set of action is too hard for disk.frame to figure out the `srckeep` automatically, you must do the `srckeep` manually." 
+ , paste0(extra_vars, collapse = ", "))) + } else { + x = srckeep(x, src_keep_cols) + } + + # make a copy dotdotdot_chunk_agg <- dotdotdot dotdotdot_collected_agg <- dotdotdot @@ -74,8 +104,18 @@ collect.summarized_disk.frame <- } } + group_by_globals_list = attr(x_as.disk.frame, "group_by_globals_and_pkgs")$globals + + if(is.null(group_by_globals_list)) { + eval_clos = parent.frame() + } else { + eval_clos = list2env(group_by_globals_list, parent=parent.frame()) + } + # TODO add appropriate environment - tmp_df = eval(first_stage_code) + # tmp_df = eval(first_stage_code, envir=environment(), enclos = eval_clos + tmp_df = eval(first_stage_code, group_by_globals_list) + n_summ_args = length(dotdotdot_chunk_agg) @@ -103,7 +143,9 @@ collect.summarized_disk.frame <- ) } - tmp2 = collect(eval(parse(text = chunk_summ_code_str))) + summarize_globals_list = attr(x_as.disk.frame, "summarize_globals_and_pkgs")$globals + + tmp2 = collect(eval(parse(text = chunk_summ_code_str), envir = summarize_globals_list)) second_stage_code = eval(parse(text = sprintf( "quote(group_by(tmp2, %s))", paste0(rep_len("NULL", n_grp_args), collapse = ", ") @@ -111,7 +153,12 @@ collect.summarized_disk.frame <- if (n_grp_args >= 1) { for (i in 1:n_grp_args) { - second_stage_code[[i + 2]] = group_by_vars[[i]] + second_stage_code[[i + 2]] = group_by_vars[[i]] %>% + deparse() %>% + paste0(collapse="") %>% + sprintf("`%s`", .) %>% + parse(text=.) 
%>% + .[[1]] } } diff --git a/R/data.table.r b/R/data.table.r index 0d12184f..9d648121 100644 --- a/R/data.table.r +++ b/R/data.table.r @@ -24,42 +24,7 @@ code = substitute(chunk[...]) - # sometimes the arguments could be empty - # in a recent version of globals that would cause a fail - # to avoid the fail remove them from the test - #dotdotdot_for_find_global = dotdotdot[!sapply(sapply(dotdotdot, as.character), function(x) all(unlist(x) == ""))] - - #ag = globals::findGlobals(dotdotdot_for_find_global) - #ag = setdiff(ag, "") # "" can cause issues with future # this line no longer needed - - - # you need to use list otherwise the names will be gone - if (paste0(deparse(code), collapse="") == "chunk_fn(NULL)") { - globals_and_pkgs = future::getGlobalsAndPackages(expression(chunk_fn())) - } else { - globals_and_pkgs = future::getGlobalsAndPackages(code) - } - - - global_vars = globals_and_pkgs$globals - - env = parent.frame() - - done = identical(env, emptyenv()) || identical(env, globalenv()) - - # keep adding global variables by moving up the environment chain - while(!done) { - tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) - new_global_vars = tmp_globals_and_pkgs$globals - for (name in setdiff(names(new_global_vars), names(global_vars))) { - global_vars[[name]] <- new_global_vars[[name]] - } - - done = identical(env, emptyenv()) || identical(env, globalenv()) - env = parent.env(env) - } - - globals_and_pkgs$globals = global_vars + globals_and_pkgs = find_globals_recursively(code, parent.frame()) res = future.apply::future_lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { #res = lapply(get_chunk_ids(df, full.names = TRUE), function(chunk_id) { diff --git a/R/disk.frame-to-parquet.r b/R/disk.frame-to-parquet.r new file mode 100644 index 00000000..d9ec2e45 --- /dev/null +++ b/R/disk.frame-to-parquet.r @@ -0,0 +1,35 @@ +#' A function to convert a disk.frame to parquet format +#' @importFrom arrow write_parquet +#' @export 
+#' @param df a disk.frame or a path to a disk.frame +#' @param outdir the path to save the parquet files +disk.frame_to_parquet <- function(df, outdir) { + if("disk.frame" %in% class(df)) { + path = attr(df, "path") + } else { + path = df + } + + path = normalizePath(path) + fst_files = normalizePath(list.files(path, "fst$", full.names = TRUE, recursive=TRUE)) + + if(!fs::dir_exists(outdir)) { + fs::dir_create(outdir) + } + + + future.apply::future_lapply(fst_files, function(fst_file) { + file_name = basename(fst_file) + file_name = paste0(stringr::str_sub(file_name, 1, nchar(file_name)-4), ".parquet") + path_name = normalizePath(dirname(fst_file)) + + # remove base directory from path + path_name = stringr::str_sub(path_name, nchar(path)+1) + + if(!fs::dir_exists(file.path(outdir, path_name))) { + fs::dir_create(file.path(outdir, path_name)) + } + + arrow::write_parquet(fst::read_fst(fst_file), file.path(outdir, path_name, file_name)) + }) +} \ No newline at end of file diff --git a/R/dplyr_verbs.r b/R/dplyr_verbs.r index 285c678f..49586774 100644 --- a/R/dplyr_verbs.r +++ b/R/dplyr_verbs.r @@ -137,6 +137,7 @@ chunk_ungroup = create_chunk_mapper(dplyr::ungroup) #ungroup.disk.frame( < - create_dplyr_mapper(dplyr::ungroup, , warning_msg="`ungroup.disk.frame` is now deprecated. Please use `chunk_ungroup` instead. This is in preparation for a more powerful `group_by` framework") + #' @export #' @rdname dplyr_verbs glimpse.disk.frame <- function(.data, ...) { diff --git a/R/get_chunk.r b/R/get_chunk.r index 01c06e61..c0125985 100644 --- a/R/get_chunk.r +++ b/R/get_chunk.r @@ -4,6 +4,7 @@ #' @param keep the columns to keep #' @param full.names whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric #' @param ... passed to fst::read_fst or whichever read function is used in the backend +#' @param partitioned_info for internal use only. 
It's a data.frame used to help with filtering by partitions #' @export #' @examples #' cars.df = as.disk.frame(cars, nchunks = 2) @@ -24,7 +25,7 @@ get_chunk <- function(...) { #' @rdname get_chunk #' @importFrom fst read_fst #' @export -get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { +get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info=NULL) { stopifnot("disk.frame" %in% class(df)) # keep_chunks = attr(df, "keep_chunks", exact=TRUE) @@ -44,9 +45,10 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { keep1_vars = paste0(keep1, collapse = ", ") keep_no_good_vars = setdiff(keep, keep1) %>% paste0(collapse = ", ") stop( - glue::glue( - "This disk.frame has a srckeep containing these variables {keep1_vars}. - You are trying to keep {keep_no_good_vars}, which are not available.")) + sprintf( + "This disk.frame has a `srckeep` containing these columns: `%s`. + You are trying to keep `%s`, which are not available.", + paste0(keep1_vars, collapse=", "), paste0(keep_no_good_vars, collapse=", "))) } keep = intersect(keep1, keep) if (!all(keep %in% keep1)) { @@ -78,19 +80,37 @@ get_chunk.disk.frame <- function(df, n, keep = NULL, full.names = FALSE, ...) { return(notbl) } - if (is.null(recordings)) { if(typeof(keep)=="closure") { - fst::read_fst(filename, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, as.data.table = TRUE,...) } else { - fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + tmp = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) 
+ } + + if(!is.null(partitioned_info)) { + res = tmp %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(res) + } else{ + return(tmp) } } else { - if(typeof(keep)!="closure") { - play(fst::read_fst(filename, as.data.table = TRUE,...), recordings) + if(typeof(keep)=="closure") { + tmp_df_input = fst::read_fst(filename, as.data.table = TRUE,...) } else { - play(fst::read_fst(filename, columns = keep, as.data.table = TRUE,...), recordings) - + tmp_df_input = fst::read_fst(filename, columns = keep, as.data.table = TRUE,...) + } + + if(!is.null(partitioned_info)) { + res = tmp_df_input %>% + mutate(.check=1) %>% + full_join(partitioned_info %>% mutate(.check=1), by=".check") %>% + select(-.check, -fullpath, -.disk.frame.sub.path) + return(play(res, recordings)) + } else{ + return(play(tmp_df_input, recordings)) } } } diff --git a/R/get_chunk_ids.r b/R/get_chunk_ids.r index bbb3890a..9a023e90 100644 --- a/R/get_chunk_ids.r +++ b/R/get_chunk_ids.r @@ -20,19 +20,19 @@ #' # clean up cars.df #' delete(cars.df) get_chunk_ids <- function(df, ..., full.names = FALSE, strip_extension = TRUE) { - lf = list.files(attr(df,"path"), full.names = full.names, ...) 
+ stopifnot("disk.frame" %in% class(df)) + + lf = list.files(attr(df,"path"), full.names = full.names, ..., recursive = TRUE) if(full.names) { return(lf) } - purrr::map_chr(lf, ~{ - tmp = stringr::str_split(.x,stringr::fixed("."), simplify = TRUE) - l = length(tmp) - if(l == 1) { - return(tmp) - } else if(strip_extension) { - paste0(tmp[-l], collapse="") - } else { - .x + + # strip out the path or file name if required + sapply(lf, function(path) { + tmp = basename(path) + if (strip_extension) { + tmp = tools::file_path_sans_ext(tmp) } + return(tmp) }) } diff --git a/R/get_partition.r b/R/get_partition.r new file mode 100644 index 00000000..b43e68f7 --- /dev/null +++ b/R/get_partition.r @@ -0,0 +1,71 @@ +#' Turn a string of the form /partition1=val/partition2=val2 into data.frame +#' @param path_strs The paths in string form to break into partition format +split_string_into_df <- function(path_strs) { + paths = dirname(path_strs) %>% unique + list_of_partitions = stringr::str_split(paths, "/") + + tmp = mapply(function(partition, path) { + part_val = stringr::str_split(partition, "=") + tmp = lapply(part_val, function(part_val, lvl) { + tmp = data.frame(partition = part_val[2]) + names(tmp) = part_val[1] + tmp + }) %>% + do.call(cbind, .) 
+ + tmp$.disk.frame.sub.path = path + + tmp + }, list_of_partitions, paths, SIMPLIFY = FALSE) %>% data.table::rbindlist() + + tmp +} + +if(F) { + df = disk.frame("C:/temp/ok.df") %>% + filter(partition == 1) +} + +#' Get the partitioning structure of a folder +#' @param df a disk.frame whose paths will be used to determine if it's +#' folder-partitioned disk.frame +#' @importFrom utils type.convert +get_partition_paths <- function(df) { + stopifnot("disk.frame" %in% class(df)) + path = tools::file_path_as_absolute(attr(df, "path")) + + allowed_paths = path + + meta_path = file.path(path, ".metadata") + + is_partitioned = FALSE + df_of_partitions = NULL + + # if it's a partitioned structure allow more search paths than root + if (length(setdiff(list.dirs(path, recursive = FALSE), meta_path)) >= 1) { + lf = list.files(path, full.names = FALSE, pattern="fst", recursive=TRUE) + + # create a data.frame of the paths so it can be filtered + df_of_partitions = split_string_into_df(lf) + # infer the types + df_of_partitions = utils::type.convert(df_of_partitions, as.is=TRUE) + + # if there is a filter operation, filter the above to figure out + allowed_paths = df_of_partitions$.disk.frame.sub.path + + # filter for some paths if necessary + partition_filter_info = attr(df, "partition_filter") + if (!is.null(partition_filter_info)) { + # apply filter + df_of_partitions = eval(partition_filter_info$expr, list(dataframe=df_of_partitions)) + + allowed_paths = df_of_partitions$.disk.frame.sub.path + } + is_partitioned = TRUE + } + + # now go through the allowed paths + list(paths=file.path(attr(df, "path"), allowed_paths), is_partitioned=is_partitioned, df = df_of_partitions) + + # TODO check all files sit within the same structure +} diff --git a/R/is_disk.frame.r b/R/is_disk.frame.r index 542bd38b..48f538da 100644 --- a/R/is_disk.frame.r +++ b/R/is_disk.frame.r @@ -10,24 +10,30 @@ #' # clean up cars.df #' delete(cars.df) is_disk.frame <- function(df) { - ##browser 
if("disk.frame" %in% class(df)) { df = attr(df, "path", exact=TRUE) } else if(!"character" %in% class(df)) { # character then check the path return(FALSE) } - files <- fs::dir_ls(df, type="file", all = TRUE) + files <- fs::dir_ls(df, type="file", all = TRUE) # if all files are fst if(length(files)>0) { if(any(purrr::map_lgl(files, ~length(grep(glob2rx("*.fst"), .x)) == 0))) { - # some of the fiels do not have a .fst extension + # some of the files do not have a .fst extension return(FALSE) } } dirs = fs::dir_ls(df, type="directory", all = TRUE) if(length(dirs) > 1) { + # are the directories of this form name=val + split_dirs = stringr::str_split(basename(setdiff(dirs, file.path(df, ".metadata"))), "=") + if (all(sapply(split_dirs, length) == 2)) { + # all folder are + # TODO check the folder recursively + return(TRUE) + } return(FALSE) } else if(length(dirs) == 1) { if(substr(dirs, nchar(dirs)-8,nchar(dirs)) != ".metadata") { diff --git a/R/names.r b/R/names.r index 05d047da..9f4393a3 100644 --- a/R/names.r +++ b/R/names.r @@ -25,17 +25,17 @@ names.disk.frame <- function(x, ...) { #' @export colnames.disk.frame <- function(x, ...) { res = attr(x, "path", exact=TRUE) %>% - list.files(full.names = TRUE) + list.files(full.names = TRUE, recursive=TRUE, pattern = "fst") - if(is.null(attr(x, "recordings"))) { + # if(is.null(attr(x, "recordings"))) { if(length(res) == 0) { return(vector("character")) } return(fst::metadata_fst(res[1])$columnNames) - } else { - tiny_example_data.frame = get_chunk(x, 1, from=1, to=1) - return(colnames(tiny_example_data.frame)) - } + # } else { + # tiny_example_data.frame = get_chunk(x, 1, from=1, to=1) + # return(colnames(tiny_example_data.frame)) + # } } diff --git a/R/nchunks.r b/R/nchunks.r index d40eb6e6..17c89888 100644 --- a/R/nchunks.r +++ b/R/nchunks.r @@ -36,7 +36,7 @@ nchunks.disk.frame <- function(df, skip.ready.check = FALSE, ...) 
{ #if(!skip.ready.check) stopifnot(is_ready(df)) fpath <- attr(df,"path", exact=TRUE) if(is.dir.disk.frame(df)) { - return(length(fs::dir_ls(fpath, type="file"))) + return(length(list.files(fpath, pattern="fst", recursive = TRUE))) } else { return(1) } diff --git a/R/ncol-nrow.r b/R/ncol-nrow.r index 1addc2dc..3f9e17c2 100644 --- a/R/ncol-nrow.r +++ b/R/ncol-nrow.r @@ -29,7 +29,7 @@ nrow.disk.frame <- function(df, ...) { stopifnot(is_ready(df)) path1 <- attr(df,"path", exact=TRUE) if(dir.exists(path1)) { - path2 <- list.files(path1,full.names = TRUE) + path2 <- list.files(path1, full.names = TRUE, recursive=TRUE, pattern = "fst") if(length(path2) == 0) { return(0) } diff --git a/R/one-stage-verbs.R b/R/one-stage-verbs.R index 27f62f2f..39d9b350 100644 --- a/R/one-stage-verbs.R +++ b/R/one-stage-verbs.R @@ -209,19 +209,31 @@ IQR_df.collected_agg.disk.frame <- function(listx, ...) { #' @rdname group_by #' @export summarise.grouped_disk.frame <- function(.data, ...) { + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) 
# convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- rlang::quo_squash(dotdotdot[[i]]) } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } @@ -246,22 +258,28 @@ summarize.grouped_disk.frame = summarise.grouped_disk.frame #' @rdname group_by # learning from https://docs.dask.org/en/latest/dataframe-groupby.html group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.frame does not support `.drop` in `group_by` at this stage")) { - class(.data) <- c("grouped_disk.frame", "disk.frame") - # using rlang is a neccesary evil here as I need to deal with !!! that is supported by group_by etc + # using rlang is a necessary evil here as I need to deal with !!! that is supported by group_by etc group_by_cols = rlang::enexprs(...) 
# convert any quosure to labels for (i in seq_along(group_by_cols)) { - if("quosure" %in% class(group_by_cols[[i]])) { - group_by_cols[[i]] <- rlang::sym(rlang::as_label(group_by_cols[[i]])) - } + group_by_cols[[i]] <- group_by_cols[[i]] %>% + rlang::quo_squash() } - attr(.data, "group_by_cols") = group_by_cols + # detect any global variables + args_str = sapply(group_by_cols, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "group_by_globals_and_pkgs") = find_globals_recursively(parse(text=sprintf("list(%s)", args_str))[[1]], parent.frame()) + .data } @@ -270,19 +288,33 @@ group_by.disk.frame <- function(.data, ..., .add = FALSE, .drop = stop("disk.fra #' @importFrom dplyr summarize #' @rdname group_by summarize.disk.frame <- function(.data, ...) { + + class(.data) <- c("summarized_disk.frame", "disk.frame") + # get all components of the summarise dotdotdot = rlang::enexprs(...) # convert any quosure to labels for (i in seq_along(dotdotdot)) { - if("quosure" %in% class(dotdotdot[[i]])) { - dotdotdot[[i]] <- rlang::sym(rlang::as_label(dotdotdot[[i]])) - } + dotdotdot[[i]] <- dotdotdot[[i]] %>% + rlang::quo_squash() } - class(.data) <- c("summarized_disk.frame", "disk.frame") attr(.data, "summarize_code") = dotdotdot + # detect any global variables + args_str = sapply(dotdotdot, function(code) { + deparse(code) %>% + paste0(collapse="") + }) %>% paste(collapse = ", ") + + + attr(.data, "summarize_globals_and_pkgs") = + find_globals_recursively( + parse(text=sprintf("list(%s)", args_str))[[1]], + parent.frame() + ) + return(.data) } diff --git a/R/overwrite_check.r b/R/overwrite_check.r index f0651315..eabcf92b 100644 --- a/R/overwrite_check.r +++ b/R/overwrite_check.r @@ -15,7 +15,6 @@ #' # clean up #' fs::dir_delete(tf) overwrite_check <- function(outdir, overwrite) { - ##browser if (is.null(outdir)) { warning("outdir is NULL; no overwrite check is performed") return(NULL) @@ -30,7 +29,7 @@ overwrite_check 
<- function(outdir, overwrite) { fs::dir_delete(outdir) }, error = function(e) { message(e) - stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reason and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems")) + stop(glue::glue("Failed to delete the directory {outdir} in preparation for overwrite, this could be due to many reasons and may be a genuine bug. Firstly, though, please ensure you do not have the folder open by Explorer (Windows) or other file management systems")) }) } diff --git a/R/partition-filter.r b/R/partition-filter.r new file mode 100644 index 00000000..6d754f2e --- /dev/null +++ b/R/partition-filter.r @@ -0,0 +1,13 @@ +#' Filter the dataset based on folder partitions +#' @param x a disk.frame +#' @param ... filtering conditions for filtering the disk.frame at (folder) partition level +#' @importFrom dplyr filter +#' @export +partition_filter <- function(x, ...) { + expr = bquote(dplyr::filter(dataframe, .(substitute(...)))) + globals = find_globals_recursively(expr, parent.frame()) + + attr(x, "partition_filter") = list(expr=expr, globals=globals) + + return(x) +} diff --git a/R/recommend_nchunks.r b/R/recommend_nchunks.r index 2b987600..bb59aec4 100644 --- a/R/recommend_nchunks.r +++ b/R/recommend_nchunks.r @@ -1,3 +1,4 @@ + #' Recommend number of chunks based on input size #' @description Computes the recommended number of chunks to break a data.frame #' into. 
It can accept filesizes in bytes (as integer) or a data.frame @@ -22,7 +23,6 @@ #' # recommend nchunks based on file size ONLY CSV is implemented at the moment #' recommend_nchunks(1024^3) recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthreads(), conservatism = 8, ram_size = df_ram_size()) { - dfsize = 0 if ("data.frame" %in% class(df)) { # the df's size in gigabytes @@ -55,7 +55,6 @@ recommend_nchunks <- function(df, type = "csv", minchunks = data.table::getDTthr #' # returns the RAM size in gigabyte (GB) #' df_ram_size() df_ram_size <- function() { - #browser() tryCatch({ ram_size = NULL # the amount of memory available in gigabytes @@ -119,10 +118,10 @@ df_ram_size <- function() { ram_size = benchmarkme::get_ram()/1024^3 if(is.na(ram_size)) { - warning("RAM size can't be determined. Assume you have 16GB of RAM.") - warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") - warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) - return(16) + warning("RAM size can't be determined. 
Assume you have 16GB of RAM.") + warning("Please report this error at github.com/xiaodaigh/disk.frame/issues") + warning(glue::glue("Please include your operating system, R version, and if using RStudio the Rstudio version number")) + return(16) } else { ram_size = max(ram_size, 1, na.rm = TRUE) return(ram_size) diff --git a/R/util.r b/R/util.r index 8b760f0f..03987c4b 100644 --- a/R/util.r +++ b/R/util.r @@ -45,3 +45,36 @@ purrr_as_mapper <- function(.f) { } return(.f) } + +#' Find globals in an expression by searching through the chain +#' @param code An expression to search for globals +#' @param envir The environment from which to begin the search +find_globals_recursively <- function(code, envir) { + globals_and_pkgs = future::getGlobalsAndPackages(code, envir) + + global_vars = globals_and_pkgs$globals + + done = identical(envir, emptyenv()) || identical(envir, globalenv()) + + if(done) return(globals_and_pkgs) + + env = parent.env(envir) + + done = identical(env, emptyenv()) || identical(env, globalenv()) + + # keep adding global variables by moving up the environment chain + while(!done) { + tmp_globals_and_pkgs = future::getGlobalsAndPackages(code, envir = env) + new_global_vars = tmp_globals_and_pkgs$globals + for (name in setdiff(names(new_global_vars), names(global_vars))) { + global_vars[[name]] <- new_global_vars[[name]] + } + + done = identical(env, emptyenv()) || identical(env, globalenv()) + env = parent.env(env) + } + + globals_and_pkgs$globals = global_vars + + return(globals_and_pkgs) +} \ No newline at end of file diff --git a/R/write_disk.frame.r b/R/write_disk.frame.r index cdfd0c36..7d38e0c4 100644 --- a/R/write_disk.frame.r +++ b/R/write_disk.frame.r @@ -7,13 +7,12 @@ #' @param nchunks number of chunks #' @param overwrite overwrite output directory #' @param shardby the columns to shard by +#' @param partitionby the columns to (folder) partition by #' @param compress compression ratio for fst files -#' @param shardby_function splitting of 
chunks: "hash" for hash function or "sort" for semi-sorted chunks -#' @param sort_splits for the "sort" shardby function, a dataframe with the split values. -#' @param desc_vars for the "sort" shardby function, the variables to sort descending. #' @param ... passed to cmap.disk.frame #' @export #' @import fst fs +#' @importFrom dplyr group_map #' @importFrom glue glue #' @examples #' cars.df = as.disk.frame(cars) @@ -33,18 +32,67 @@ write_disk.frame <- function( nchunks.disk.frame(diskf), recommend_nchunks(diskf)), overwrite = FALSE, - shardby=NULL, compress = 50, shardby_function="hash", sort_splits=NULL, desc_vars=NULL, ...) { + shardby=NULL, + partitionby=NULL, + compress = 50, ...) { force(nchunks) overwrite_check(outdir, overwrite) - if(is.null(outdir)) { stop("write_disk.frame error: outdir must not be NULL") } if(is_disk.frame(diskf)) { - if(is.null(shardby)) { + if(!is.null(partitionby)) { + + # for each chunk group by the partionby and then write out a partitioned disk.frame for each chunk + list_of_paths = diskf %>% + cimap(~{ + tmp_dir_to_write = tempfile(as.character(.y)) + tmp = .x %>% + group_by(!!!syms(partitionby)) %>% + dplyr::group_map(~{ + # convert group keys to path + tmp_path = lapply(names(.y), function(n) { + sprintf("%s=%s", n, .y[, n]) + }) %>% + do.call(file.path, .) 
+ + final_tmp_path = file.path(tmp_dir_to_write, tmp_path) + as.disk.frame(.x, final_tmp_path, overwrite = FALSE) + }) + return(tmp_dir_to_write) + }, lazy=FALSE) + + # for each of the chunks, do a soft row-append + partitioned_files = lapply(list_of_paths, function(path) { + # each path is a partitioned disk.frame + files = list.files(path, full.names = TRUE, recursive=TRUE) + tmp = data.frame(partition_path = files %>% + dirname %>% + sapply(tools::file_path_as_absolute) %>% + stringr::str_sub(nchar(path)+2)) + tmp = tmp %>% mutate(path=path, files=files) + + tmp + }) %>% rbindlist + + partitioned_files %>% + group_by(partition_path) %>% + dplyr::group_map(function(df, grp) { + mapply(function(file, i) { + outfile = file.path(outdir, grp$partition_path, paste0(i, ".fst")) + if(!dir.exists(file.path(outdir, grp$partition_path))) { + fs::dir_create(file.path(outdir, grp$partition_path)) + } + fs::file_move(file, outfile) + }, df$files, seq_along(df$files)) + }) + + return(disk.frame(outdir)) + + } else if(is.null(shardby)) { path = attr(diskf, "path") files_shortname <- list.files(path) cids = get_chunk_ids(diskf, full.names = T, strip_extension = F) @@ -69,8 +117,6 @@ write_disk.frame <- function( overwrite = TRUE, shardby = shardby, compress = compress, - shardby_function=shardby_function, - sort_splits=sort_splits, desc_vars=desc_vars, ... ) } diff --git a/R/zzz.r b/R/zzz.r index 7c82cd92..02b5febf 100644 --- a/R/zzz.r +++ b/R/zzz.r @@ -5,15 +5,35 @@ #' @importFrom future nbrOfWorkers #' @importFrom crayon red blue green .onAttach <- function(libname, pkgname) { - #setup_disk.frame() - packageStartupMessage( - crayon::red( + crayon::blue( +"\n\n Thank you for using {disk.frame}. However {disk.frame} has been soft-deprecated. You are recommended to switch over to using the {arrow} package for handling larger-than-RAM data. 
You can convert your existing disk.frames to the parquet format which {arrow} can use by using:\n\n +``` +disk.frame::disk.frame_to_parquet(path.to.your.disk.frame, parquet_path) +``` + +Once done you can use {arrow}'s dataset features to manipulate the larger-than-RAM data using dplyr verbs. E.g. + +``` +dataset = arrow::open_dataset(parquet_path) + +dataset |> + mutate(...) |> + group_by(...) |> + summarize(...) |> + collect(...) +``` +" + ), + crayon::red( glue::glue( - "\n\n## Message from disk.frame: + "\n\n## Message from disk.frame: We have {future::nbrOfWorkers()} workers to use with disk.frame. -To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults.")), - crayon::green("\n\n +To change that, use setup_disk.frame(workers = n) or just setup_disk.frame() to use the defaults." + ) + ), + crayon::green( + "\n\n It is recommended that you run the following immediately to set up disk.frame with multiple workers in order to parallelize your operations:\n\n ```r # this will set up disk.frame with multiple workers @@ -21,7 +41,8 @@ setup_disk.frame() # this will allow unlimited amount of data to be passed from worker to worker options(future.globals.maxSize = Inf) ``` -\n\n")) +\n\n" + )) } globalVariables(c( @@ -58,7 +79,13 @@ globalVariables(c( "pathB", "w", "xid", - "yid")) + "yid", + "paths", + ".disk.frame.sub.path", + "fullpath", + ".check", + "partition_path" + )) #' @useDynLib disk.frame #' @importFrom Rcpp evalCpp diff --git a/cran-comments.md b/cran-comments.md index 05b563d8..245c6d74 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,14 +1,12 @@ -## Resubmission 1 for v0.6.0 -* removed the vignette builder which should remove the 1 NOTE - -## Submission for v0.6.0 -* Updated the API for many functions leading to semver update of functions +## Submission for v0.7 +* Implemented partition by folder +* Updated R version to v4.0 ## Test environments * local Windows 11 Pro install, R 4.1.2 * local
Windows 11 Pro install, R devel (as of 2022-01-31) +* local Windows 11 Pro install, R devel (as of 2022-02-01) * local Linux/Ubuntu install, R 4.1.2 -* local Linux/Ubuntu install, R devel (as of 2022-01-31) +* local Linux/Ubuntu install, R devel (as of 2022-02-01) ## R CMD check results There were no ERRORs nor WARNINGs nor NOTE when run locally. diff --git a/docs/404.html b/docs/404.html index 9ccac74e..49d167f0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 5757cc4b..d3b2e167 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/01-intro.html b/docs/articles/01-intro.html index af353427..35e9a2b4 100644 --- a/docs/articles/01-intro.html +++ b/docs/articles/01-intro.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/02-intro-disk-frame.html b/docs/articles/02-intro-disk-frame.html index 5526ed10..adfe01a1 100644 --- a/docs/articles/02-intro-disk-frame.html +++ b/docs/articles/02-intro-disk-frame.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 @@ -555,26 +555,26 @@

Sampling
 flights.df %>% sample_frac(0.01) %>% collect %>% head
 #>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1  10     1614           1605         9     1926           1944
-#> 2: 2013     5  13     1136           1136         0     1225           1234
-#> 3: 2013     5  16     1428           1420         8     1535           1538
-#> 4: 2013    12  29     1536           1520        16     1817           1750
-#> 5: 2013    12  21      849            820        29     1315           1345
-#> 6: 2013    12  25     1624           1559        25     1846           1825
+#> 1: 2013     5  24     1452           1455        -3     1840           1820
+#> 2: 2013     1  10      939            905        34     1259           1235
+#> 3: 2013     2   4     1911           1915        -4     2117           2111
+#> 4: 2013     5  12     1634           1545        49     1940           1916
+#> 5: 2013     1  16     2315           2250        25       40              7
+#> 6: 2013    12  28      913            900        13     1215           1227
 #>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:       -18      DL   1508  N952DL    JFK  RSW      163     1074   16      5
-#> 2:        -9      EV   3830  N13955    EWR  PVD       35      160   11     36
-#> 3:        -3      EV   4284  N11536    EWR  ROC       45      246   14     20
-#> 4:        27      MQ   3553  N520MQ    LGA  XNA      181     1147   15     20
-#> 5:       -30      DL    454  N682DA    JFK  STT      188     1623    8     20
-#> 6:        21      EV   5567  N870AS    LGA  CAE       99      617   15     59
+#> 1:        20      AA   1769  N372AA    JFK  MIA      165     1089   14     55
+#> 2:        24      VX    407  N622VA    JFK  LAX      349     2475    9      5
+#> 3:         6      9E   3525  N903XJ    JFK  ORD      127      740   19     15
+#> 4:        24      DL   1773  N3739P    JFK  SLC      262     1990   15     45
+#> 5:        33      B6     30  N187JB    JFK  ROC       59      264   22     50
+#> 6:       -12      DL    422  N713TW    JFK  LAX      327     2475    9      0
 #>              time_hour
-#> 1: 2013-01-10 21:00:00
-#> 2: 2013-05-13 15:00:00
-#> 3: 2013-05-16 18:00:00
-#> 4: 2013-12-29 20:00:00
-#> 5: 2013-12-21 13:00:00
-#> 6: 2013-12-25 20:00:00
+#> 1: 2013-05-24 18:00:00 +#> 2: 2013-01-10 14:00:00 +#> 3: 2013-02-05 00:00:00 +#> 4: 2013-05-12 19:00:00 +#> 5: 2013-01-17 03:00:00 +#> 6: 2013-12-28 14:00:00

Writing Data diff --git a/docs/articles/03-concepts.html b/docs/articles/03-concepts.html index 5797c66c..95c0d134 100644 --- a/docs/articles/03-concepts.html +++ b/docs/articles/03-concepts.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7

diff --git a/docs/articles/04-ingesting-data.html b/docs/articles/04-ingesting-data.html index d9e14783..fb122f5d 100644 --- a/docs/articles/04-ingesting-data.html +++ b/docs/articles/04-ingesting-data.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/05-data-table-syntax.html b/docs/articles/05-data-table-syntax.html index 7027aebf..3c67ccc2 100644 --- a/docs/articles/05-data-table-syntax.html +++ b/docs/articles/05-data-table-syntax.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/06-vs-dask-juliadb.html b/docs/articles/06-vs-dask-juliadb.html index fe2bcd28..e8652bab 100644 --- a/docs/articles/06-vs-dask-juliadb.html +++ b/docs/articles/06-vs-dask-juliadb.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 @@ -189,14 +189,14 @@

disk.framesystem.time(setup_disk.frame()) # ~4s #> The number of workers available for disk.frame is 6 #> user system elapsed -#> 0.20 0.03 2.32 +#> 0.15 0.04 2.41

We note that there is some time needed for disk.frame to start up all the workers. Next we try to convert the largest CSV file to disk.frame format. The file to be converted is about 2.2GB in size

 time_to_convert_disk.frame = system.time(df1 <- csv_to_disk.frame("c:/data/Performance_2004Q3.txt", header = FALSE))[3]
 
 time_to_convert_disk.frame
 #> elapsed 
-#>    28.3
+#> 29.67

Now that we have converted it, we want to do a count by the first column. To achieve this we use a “two-stage” aggregation strategy. Note that we use keep="V1" to bring only the column V1 into RAM. This avoids the reading of other unnecessary columns and should speed-up the analysis significantly

 time_to_agg_disk.frame = system.time(summ <- df1[,.N, V1, keep = "V1"][, .(N = sum(N)), V1])
@@ -204,7 +204,7 @@ 
disk.frametime_to_agg_disk.frame #> user system elapsed -#> 0.13 0.03 8.48
+#> 0.08 0.01 8.82

We can inspect the result as well.

 summ
@@ -228,7 +228,7 @@ 
disk.framesummarise(N = n()) %>% collect) #> user system elapsed -#> 1.53 0.17 10.25
+#> 1.64 0.13 4.43

However, the dplyr syntax tends to be slightly slower than using data.table syntax. This may be improved as much of the overhead is due to inefficient use of NSE.

diff --git a/docs/articles/07-glm.html b/docs/articles/07-glm.html index 2b340f3c..01e5f05e 100644 --- a/docs/articles/07-glm.html +++ b/docs/articles/07-glm.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7
diff --git a/docs/articles/08-more-epic.html b/docs/articles/08-more-epic.html index f4170f94..a15ed651 100644 --- a/docs/articles/08-more-epic.html +++ b/docs/articles/08-more-epic.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/09-convenience-features.html b/docs/articles/09-convenience-features.html index ef7bf2ed..9e2bcfc3 100644 --- a/docs/articles/09-convenience-features.html +++ b/docs/articles/09-convenience-features.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/10-group-by.html b/docs/articles/10-group-by.html index f18f3b51..eaa55ced 100644 --- a/docs/articles/10-group-by.html +++ b/docs/articles/10-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/11-custom-group-by.html b/docs/articles/11-custom-group-by.html index 695f467f..8ca8d702 100644 --- a/docs/articles/11-custom-group-by.html +++ b/docs/articles/11-custom-group-by.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/88-trouble-shooting.html b/docs/articles/88-trouble-shooting.html index e1ba2f47..cb894f3d 100644 --- a/docs/articles/88-trouble-shooting.html +++ b/docs/articles/88-trouble-shooting.html @@ -33,7 +33,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/articles/index.html b/docs/articles/index.html index 3e4cba20..ab7832cd 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/authors.html b/docs/authors.html index 97226196..7c8e7588 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -110,13 +110,13 @@

Citation

ZJ D (2022). disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework. -R package version 0.6.0, https://diskframe.com. +R package version 0.7, https://diskframe.com.

@Manual{,
   title = {disk.frame: Larger-than-RAM Disk-Based Data Manipulation Framework},
   author = {Dai ZJ},
   year = {2022},
-  note = {R package version 0.6.0},
+  note = {R package version 0.7},
   url = {https://diskframe.com},
 }
diff --git a/docs/index.html b/docs/index.html index 7ea2fc4e..f86e68c5 100644 --- a/docs/index.html +++ b/docs/index.html @@ -37,7 +37,7 @@ disk.frame - 0.6.0 + 0.7 @@ -228,6 +228,7 @@

Quick-start# this will setup disk.frame's parallel backend with number of workers equal to the number of CPU cores (hyper-threaded cores are counted as one not two) setup_disk.frame() +#> The number of workers available for disk.frame is 6 # this allows large datasets to be transferred between sessions options(future.globals.maxSize = Inf) @@ -247,12 +248,12 @@

dplyr verbsfilter(year == 2013) %>% mutate(origin_dest = paste0(origin, dest)) %>% head(2) -#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time -#> 1: 2013 1 1 517 515 2 830 819 -#> 2: 2013 1 1 533 529 4 850 830 -#> arr_delay carrier flight tailnum origin dest air_time distance hour minute -#> 1: 11 UA 1545 N14228 EWR IAH 227 1400 5 15 -#> 2: 20 UA 1714 N24211 LGA IAH 227 1416 5 29 +#> year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay +#> 1: 2013 1 1 517 515 2 830 819 11 +#> 2: 2013 1 1 533 529 4 850 830 20 +#> carrier flight tailnum origin dest air_time distance hour minute +#> 1: UA 1545 N14228 EWR IAH 227 1400 5 15 +#> 2: UA 1714 N24211 LGA IAH 227 1416 5 29 #> time_hour origin_dest #> 1: 2013-01-01 05:00:00 EWRIAH #> 2: 2013-01-01 05:00:00 LGAIAH @@ -380,6 +381,7 @@

Example: data.table syntax.(qtr = ifelse(month <= 3, "Q1", "Q2")) ] ) +#> data.table syntax for disk.frame may be moved to a separate package in the future grp_by_stage1 #> qtr sum_dist @@ -405,24 +407,24 @@

Basic info
 # where is the disk.frame stored
 attr(flights.df, "path")
-#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQH7obF\\file42d452c32907.df"
+#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpQJt0m9\\file393027da33b.df"

A number of data.frame functions are implemented for disk.frame

 # get first few rows
 head(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     1   1      517            515         2      830            819
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        11      UA   1545  N14228    EWR  IAH      227     1400    5     15
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     1   1      517            515         2      830            819        11
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      UA   1545  N14228    EWR  IAH      227     1400    5     15
 #>              time_hour
 #> 1: 2013-01-01 05:00:00
 # get last few rows
 tail(flights.df, 1)
-#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-#> 1: 2013     9  30       NA            840        NA       NA           1020
-#>    arr_delay carrier flight tailnum origin dest air_time distance hour minute
-#> 1:        NA      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
+#>    year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
+#> 1: 2013     9  30       NA            840        NA       NA           1020        NA
+#>    carrier flight tailnum origin dest air_time distance hour minute
+#> 1:      MQ   3531  N839MQ    LGA  RDU       NA      431    8     40
 #>              time_hour
 #> 1: 2013-09-30 08:00:00
diff --git a/docs/news/index.html b/docs/news/index.html
index 48d1c26d..551c9495 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -17,7 +17,7 @@
       
       
         disk.frame
-        0.6.0
+        0.7
       
     
@@ -90,6 +90,17 @@

Changelog

Source: NEWS.md +
+ +
+
+ +
@@ -135,12 +135,12 @@

Examples

# add a chunk to diskf add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 1 #> nrow (at source): 50 #> ncol (at source): 2 add_chunk(diskf, cars) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk" #> nchunks: 2 #> nrow (at source): 100 #> ncol (at source): 2 @@ -154,12 +154,12 @@

Examples

# you wish to add multiple chunk in parralel add_chunk(df2, data.frame(chunk=1), 1) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 1 #> nrow (at source): 1 #> ncol (at source): 1 add_chunk(df2, data.frame(chunk=2), 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/tmp_add_chunk2" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/tmp_add_chunk2" #> nchunks: 2 #> nrow (at source): 2 #> ncol (at source): 1 diff --git a/docs/reference/as.data.frame.disk.frame.html b/docs/reference/as.data.frame.disk.frame.html index aea1586c..e93f1492 100644 --- a/docs/reference/as.data.frame.disk.frame.html +++ b/docs/reference/as.data.frame.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/as.data.table.disk.frame.html b/docs/reference/as.data.table.disk.frame.html index 1e5146c2..02416086 100644 --- a/docs/reference/as.data.table.disk.frame.html +++ b/docs/reference/as.data.table.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/as.disk.frame.html b/docs/reference/as.disk.frame.html index fa22a8ea..655a13b1 100644 --- a/docs/reference/as.disk.frame.html +++ b/docs/reference/as.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/bind_rows.disk.frame.html b/docs/reference/bind_rows.disk.frame.html index 3a336451..0f028244 100644 --- a/docs/reference/bind_rows.disk.frame.html +++ b/docs/reference/bind_rows.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/chunk_group_by.html b/docs/reference/chunk_group_by.html index f7bbff69..52940b9f 100644 --- a/docs/reference/chunk_group_by.html +++ b/docs/reference/chunk_group_by.html @@ -34,7 +34,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/cmap.html b/docs/reference/cmap.html index 1a518792..5909c5a5 100644 --- a/docs/reference/cmap.html +++ 
b/docs/reference/cmap.html @@ -21,7 +21,7 @@ disk.frame - 0.6.0 + 0.7 @@ -257,7 +257,7 @@

Examples

# return the first row of each chunk eagerly as list cmap(cars.df, ~.x[1,], lazy = FALSE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file4718220342f7.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c385263150e.df" #> nchunks: 6 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/cmap2.html b/docs/reference/cmap2.html index 336c10b5..5a9bf032 100644 --- a/docs/reference/cmap2.html +++ b/docs/reference/cmap2.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/collect.html b/docs/reference/collect.html index 0d063a49..358f66f3 100644 --- a/docs/reference/collect.html +++ b/docs/reference/collect.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/colnames.html b/docs/reference/colnames.html index 210eada5..0956e34d 100644 --- a/docs/reference/colnames.html +++ b/docs/reference/colnames.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/compute.disk.frame.html b/docs/reference/compute.disk.frame.html index 5e9a9f2c..bf8e3c87 100644 --- a/docs/reference/compute.disk.frame.html +++ b/docs/reference/compute.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/create_chunk_mapper.html b/docs/reference/create_chunk_mapper.html index 1269b5dd..3a425805 100644 --- a/docs/reference/create_chunk_mapper.html +++ b/docs/reference/create_chunk_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/csv_to_disk.frame.html b/docs/reference/csv_to_disk.frame.html index f91d899a..7e59e587 100644 --- a/docs/reference/csv_to_disk.frame.html +++ b/docs/reference/csv_to_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/delete.html b/docs/reference/delete.html index 9cf26191..5ac2c8de 100644 --- a/docs/reference/delete.html +++ b/docs/reference/delete.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/df_ram_size.html 
b/docs/reference/df_ram_size.html index c986617f..898899a8 100644 --- a/docs/reference/df_ram_size.html +++ b/docs/reference/df_ram_size.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/dfglm.html b/docs/reference/dfglm.html index c9cc7c73..b6b3dd12 100644 --- a/docs/reference/dfglm.html +++ b/docs/reference/dfglm.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/disk.frame.html b/docs/reference/disk.frame.html index fd1f9556..e4c01f98 100644 --- a/docs/reference/disk.frame.html +++ b/docs/reference/disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -111,7 +111,7 @@

Arguments

Examples

path = file.path(tempdir(),"cars")
 as.disk.frame(cars, outdir=path, overwrite = TRUE, nchunks = 2)
-#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm/cars"
+#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH/cars"
 #> nchunks: 2
 #> nrow (at source): 50
 #> ncol (at source): 2
diff --git a/docs/reference/dplyr_verbs.html b/docs/reference/dplyr_verbs.html
index 143e9c7e..ff377e41 100644
--- a/docs/reference/dplyr_verbs.html
+++ b/docs/reference/dplyr_verbs.html
@@ -18,7 +18,7 @@
       
       
         disk.frame
-        0.6.0
+        0.7
       
     
diff --git a/docs/reference/evalparseglue.html b/docs/reference/evalparseglue.html index d88bc80a..78e280de 100644 --- a/docs/reference/evalparseglue.html +++ b/docs/reference/evalparseglue.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/find_globals_recursively.html b/docs/reference/find_globals_recursively.html new file mode 100644 index 00000000..b19e8a83 --- /dev/null +++ b/docs/reference/find_globals_recursively.html @@ -0,0 +1,133 @@ + +Find globals in an expression by searching through the chain — find_globals_recursively • disk.frame + + +
+
+ + + +
+
+ + +
+

Find globals in an expression by searching through the chain

+
+ +
+
find_globals_recursively(code, envir)
+
+ +
+

Arguments

+
code
+

An expression to search for globals

+
envir
+

The environment from which to begin the search

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/foverlaps.disk.frame.html b/docs/reference/foverlaps.disk.frame.html index 6fbc7457..383c9d0a 100644 --- a/docs/reference/foverlaps.disk.frame.html +++ b/docs/reference/foverlaps.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/gen_datatable_synthetic.html b/docs/reference/gen_datatable_synthetic.html index 8855f4c1..f890a9a7 100644 --- a/docs/reference/gen_datatable_synthetic.html +++ b/docs/reference/gen_datatable_synthetic.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/get_chunk.html b/docs/reference/get_chunk.html index fffdd1c7..a7e53e5b 100644 --- a/docs/reference/get_chunk.html +++ b/docs/reference/get_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -99,7 +99,7 @@

Obtain one chunk by chunk id

get_chunk(...)
 
 # S3 method for disk.frame
-get_chunk(df, n, keep = NULL, full.names = FALSE, ...)
+get_chunk(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL)
@@ -114,6 +114,8 @@

Arguments

the columns to keep

full.names

whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric

+
partitioned_info
+

for internal use only. It's a data.frame used to help with filtering by partitions

diff --git a/docs/reference/get_chunk_ids.html b/docs/reference/get_chunk_ids.html index fc6d82da..ac2822dc 100644 --- a/docs/reference/get_chunk_ids.html +++ b/docs/reference/get_chunk_ids.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7
@@ -117,20 +117,22 @@

Examples

# return the integer-string chunk IDs get_chunk_ids(cars.df) -#> [1] "1" "2" "3" "4" "5" "6" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1" "2" "3" "4" "5" "6" # return the file name chunk IDs get_chunk_ids(cars.df, full.names = TRUE) -#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/1.fst" -#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/2.fst" -#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/3.fst" -#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/4.fst" -#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/5.fst" -#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpyknGIm\\file471836dbe43.df/6.fst" +#> [1] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/1.fst" +#> [2] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/2.fst" +#> [3] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/3.fst" +#> [4] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/4.fst" +#> [5] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/5.fst" +#> [6] "C:\\Users\\RTX2080\\AppData\\Local\\Temp\\RtmpUX5lOH\\file1c385fab1acf.df/6.fst" # return the file name chunk IDs with file extension get_chunk_ids(cars.df, strip_extension = FALSE) -#> [1] "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" +#> 1.fst 2.fst 3.fst 4.fst 5.fst 6.fst +#> "1.fst" "2.fst" "3.fst" "4.fst" "5.fst" "6.fst" # clean up cars.df delete(cars.df) diff --git a/docs/reference/get_partition_paths.html b/docs/reference/get_partition_paths.html new file mode 100644 index 00000000..7dff33ce --- /dev/null +++ b/docs/reference/get_partition_paths.html @@ -0,0 +1,132 @@ + +Get the partitioning structure of a folder — get_partition_paths • disk.frame + + +
+
+ + + +
+
+ + +
+

Get the partitioning structure of a folder

+
+ +
+
get_partition_paths(df)
+
+ +
+

Arguments

+
df
+

a disk.frame whose paths will be used to determine if it's +folder-partitioned disk.frame

+
+ +
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/group_by.html b/docs/reference/group_by.html index 7149e006..2a026cdb 100644 --- a/docs/reference/group_by.html +++ b/docs/reference/group_by.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/groups.disk.frame.html b/docs/reference/groups.disk.frame.html index 89dc5d99..d1c2b154 100644 --- a/docs/reference/groups.disk.frame.html +++ b/docs/reference/groups.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 01e04939..50bd1d56 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/index.html b/docs/reference/index.html index 7447de77..264deb04 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -176,6 +176,10 @@

All functions evalparseglue()

Helper function to evalparse some `glue::glue` string

+ +

find_globals_recursively()

+ +

Find globals in an expression by searching through the chain

foverlaps.disk.frame()

@@ -192,6 +196,10 @@

All functions get_chunk_ids()

Get the chunk IDs and files names

+ +

get_partition_paths()

+ +

Get the partitioning structure of a folder

groups(<disk.frame>)

@@ -240,6 +248,10 @@

All functions overwrite_check()

Check if the outdir exists or not

+ +

partition_filter()

+ +

Filter the dataset based on folder partitions

play()

@@ -296,6 +308,10 @@

All functions show_ceremony() ceremony_text() show_boilerplate() insert_ceremony()

Show the code to setup disk.frame

+ +

split_string_into_df()

+ +

Turn a string of the form /partion1=val/partion2=val2 into data.frame

srckeep()

diff --git a/docs/reference/is_disk.frame.html b/docs/reference/is_disk.frame.html index d32d0036..0a09221c 100644 --- a/docs/reference/is_disk.frame.html +++ b/docs/reference/is_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/join.html b/docs/reference/join.html index 87b1e852..66d1b402 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/make_glm_streaming_fn.html b/docs/reference/make_glm_streaming_fn.html index 63de7ae6..a9e7818c 100644 --- a/docs/reference/make_glm_streaming_fn.html +++ b/docs/reference/make_glm_streaming_fn.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/merge.disk.frame.html b/docs/reference/merge.disk.frame.html index f93a7f53..08f3ff97 100644 --- a/docs/reference/merge.disk.frame.html +++ b/docs/reference/merge.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/move_to.html b/docs/reference/move_to.html index d4211c0b..b28af7b5 100644 --- a/docs/reference/move_to.html +++ b/docs/reference/move_to.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/nchunks.html b/docs/reference/nchunks.html index 9321954a..f9a52a07 100644 --- a/docs/reference/nchunks.html +++ b/docs/reference/nchunks.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/ncol_nrow.html b/docs/reference/ncol_nrow.html index 5b5d239a..b5d845bb 100644 --- a/docs/reference/ncol_nrow.html +++ b/docs/reference/ncol_nrow.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/one-stage-group-by-verbs.html b/docs/reference/one-stage-group-by-verbs.html index ac3c5abf..bed4cc3e 100644 --- a/docs/reference/one-stage-group-by-verbs.html +++ b/docs/reference/one-stage-group-by-verbs.html @@ -19,7 +19,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/overwrite_check.html b/docs/reference/overwrite_check.html index 
a33eaeb4..67a2b150 100644 --- a/docs/reference/overwrite_check.html +++ b/docs/reference/overwrite_check.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/partition_filter.html b/docs/reference/partition_filter.html new file mode 100644 index 00000000..14d62333 --- /dev/null +++ b/docs/reference/partition_filter.html @@ -0,0 +1,133 @@ + +Filter the dataset based on folder partitions — partition_filter • disk.frame + + +
+
+ + + +
+
+ + +
+

Filter the dataset based on folder partitions

+
+ +
+
partition_filter(x, ...)
+
+ +
+

Arguments

+
x
+

a disk.frame

+
...
+

filtering conditions for filtering the disk.frame at (folder) partition level

+
+ +
+ +
+ + +
+ +
+

Site built with pkgdown 2.0.2.

+
+ +
+ + + + + + + + diff --git a/docs/reference/play.html b/docs/reference/play.html index 286b1855..1fd2f784 100644 --- a/docs/reference/play.html +++ b/docs/reference/play.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/print.disk.frame.html b/docs/reference/print.disk.frame.html index 76b587e8..d39140bf 100644 --- a/docs/reference/print.disk.frame.html +++ b/docs/reference/print.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/pull.disk.frame.html b/docs/reference/pull.disk.frame.html index 172cb910..41b02d1a 100644 --- a/docs/reference/pull.disk.frame.html +++ b/docs/reference/pull.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/purrr_as_mapper.html b/docs/reference/purrr_as_mapper.html index 4e334c11..5a118b61 100644 --- a/docs/reference/purrr_as_mapper.html +++ b/docs/reference/purrr_as_mapper.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/rbindlist.disk.frame.html b/docs/reference/rbindlist.disk.frame.html index b50ff95f..4ca1ab33 100644 --- a/docs/reference/rbindlist.disk.frame.html +++ b/docs/reference/rbindlist.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/rechunk.html b/docs/reference/rechunk.html index b5f2b78c..1e7fc40a 100644 --- a/docs/reference/rechunk.html +++ b/docs/reference/rechunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -126,8 +126,8 @@

Examples

# re-chunking cars.df to 3 chunks, done "in-place" to the same folder as cars.df rechunk(cars.df, 3) -#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\back_up_tmp_dir471815f02658. You can recover there files until you restart your R session -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471862cf5821.df" +#> files have been backed up to temporary dir C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\back_up_tmp_dir1c382d2e21d6. You can recover there files until you restart your R session +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3824aa62c4.df" #> nchunks: 3 #> nrow (at source): 50 #> ncol (at source): 2 diff --git a/docs/reference/recommend_nchunks.html b/docs/reference/recommend_nchunks.html index dd1c227b..f1a9a12e 100644 --- a/docs/reference/recommend_nchunks.html +++ b/docs/reference/recommend_nchunks.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/remove_chunk.html b/docs/reference/remove_chunk.html index 232f00e1..8fbb459d 100644 --- a/docs/reference/remove_chunk.html +++ b/docs/reference/remove_chunk.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -116,7 +116,7 @@

Examples

# removes 3rd chunk remove_chunk(cars.df, 3) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 3 #> nrow (at source): 37 #> ncol (at source): 2 @@ -125,7 +125,7 @@

Examples

# removes 4th chunk remove_chunk(cars.df, "4.fst") -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 2 #> nrow (at source): 26 #> ncol (at source): 2 @@ -134,7 +134,7 @@

Examples

# removes 2nd chunk remove_chunk(cars.df, file.path(attr(cars.df, "path", exact=TRUE), "2.fst"), full.names = TRUE) -#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file47184b42308d.df" +#> path: "C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3821d32360.df" #> nchunks: 1 #> nrow (at source): 13 #> ncol (at source): 2 diff --git a/docs/reference/sample.html b/docs/reference/sample.html index 056cef09..45c890af 100644 --- a/docs/reference/sample.html +++ b/docs/reference/sample.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 @@ -127,28 +127,28 @@

Examples

collect(sample_frac(cars.df, 0.5)) #> speed dist -#> 1: 7 4 -#> 2: 4 2 -#> 3: 10 18 -#> 4: 10 34 +#> 1: 7 22 +#> 2: 10 18 +#> 3: 9 10 +#> 4: 8 16 #> 5: 11 17 #> 6: 13 34 -#> 7: 13 26 -#> 8: 12 28 -#> 9: 15 26 -#> 10: 13 46 -#> 11: 16 32 -#> 12: 15 20 -#> 13: 18 42 -#> 14: 17 50 -#> 15: 19 36 -#> 16: 18 76 -#> 17: 20 56 -#> 18: 20 64 -#> 19: 23 54 -#> 20: 20 52 -#> 21: 24 93 -#> 22: 25 85 +#> 7: 13 34 +#> 8: 12 14 +#> 9: 14 80 +#> 10: 15 54 +#> 11: 14 36 +#> 12: 16 32 +#> 13: 17 50 +#> 14: 18 42 +#> 15: 18 56 +#> 16: 18 84 +#> 17: 20 52 +#> 18: 20 32 +#> 19: 20 64 +#> 20: 19 46 +#> 21: 24 92 +#> 22: 24 70 #> speed dist # clean up cars.df diff --git a/docs/reference/setup_disk.frame.html b/docs/reference/setup_disk.frame.html index 6d502bac..a2994988 100644 --- a/docs/reference/setup_disk.frame.html +++ b/docs/reference/setup_disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shard.html b/docs/reference/shard.html index 7b598974..f419f59c 100644 --- a/docs/reference/shard.html +++ b/docs/reference/shard.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shardkey.html b/docs/reference/shardkey.html index 2fce89e2..f4553236 100644 --- a/docs/reference/shardkey.html +++ b/docs/reference/shardkey.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/shardkey_equal.html b/docs/reference/shardkey_equal.html index dbc9bcce..7b013cd6 100644 --- a/docs/reference/shardkey_equal.html +++ b/docs/reference/shardkey_equal.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/show_ceremony.html b/docs/reference/show_ceremony.html index d1d69c51..8df3ffd8 100644 --- a/docs/reference/show_ceremony.html +++ b/docs/reference/show_ceremony.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/split_string_into_df.html b/docs/reference/split_string_into_df.html new file mode 100644 index 00000000..9a65c61c --- /dev/null +++ 
b/docs/reference/split_string_into_df.html @@ -0,0 +1,131 @@ + +Turn a string of the form /partion1=val/partion2=val2 into data.frame — split_string_into_df • disk.frame + + +
+
+ + + +
+
+ + +
+

Turn a string of the form /partition1=val/partition2=val2 into data.frame

+
+ +
+
split_string_into_df(path_strs)
+
+ +
+

Arguments

+
path_strs
+

The paths in string form to break into partition format

+
+ +
+ +
+ + +
+ + + + + + + + diff --git a/docs/reference/srckeep.html b/docs/reference/srckeep.html index 8bfaee95..5b927dbd 100644 --- a/docs/reference/srckeep.html +++ b/docs/reference/srckeep.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/sub-.disk.frame.html b/docs/reference/sub-.disk.frame.html index 3c758a26..9023b0d1 100644 --- a/docs/reference/sub-.disk.frame.html +++ b/docs/reference/sub-.disk.frame.html @@ -17,7 +17,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/tbl_vars.disk.frame.html b/docs/reference/tbl_vars.disk.frame.html index 1df5aeaf..6a51358a 100644 --- a/docs/reference/tbl_vars.disk.frame.html +++ b/docs/reference/tbl_vars.disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 diff --git a/docs/reference/write_disk.frame.html b/docs/reference/write_disk.frame.html index 6eda846a..b57289cc 100644 --- a/docs/reference/write_disk.frame.html +++ b/docs/reference/write_disk.frame.html @@ -18,7 +18,7 @@ disk.frame - 0.6.0 + 0.7 @@ -105,10 +105,8 @@

Write disk.frame to disk

recommend_nchunks(diskf)), overwrite = FALSE, shardby = NULL, + partitionby = NULL, compress = 50, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, ... ) @@ -127,14 +125,10 @@

Arguments

overwrite output directory

shardby

the columns to shard by

+
partitionby
+

the columns to (folder) partition by

compress

compression ratio for fst files

-
shardby_function
-

splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks

-
sort_splits
-

for the "sort" shardby function, a dataframe with the split values.

-
desc_vars
-

for the "sort" shardby function, the variables to sort descending.

...

passed to cmap.disk.frame

diff --git a/docs/reference/zip_to_disk.frame.html b/docs/reference/zip_to_disk.frame.html index fe4335d3..9a170ce9 100644 --- a/docs/reference/zip_to_disk.frame.html +++ b/docs/reference/zip_to_disk.frame.html @@ -20,7 +20,7 @@ disk.frame - 0.6.0 + 0.7 @@ -144,7 +144,7 @@

Examples

# read every file and convert it to a disk.frame zip.df = zip_to_disk.frame(zipfile, tempfile(fileext = ".df")) -#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpyknGIm\file471855d8254c.zip' cannot be opened +#> Error in unzip(zipfile, list = TRUE): zip file 'C:\Users\RTX2080\AppData\Local\Temp\RtmpUX5lOH\file1c3872483ca5.zip' cannot be opened # there is only one csv file so it return a list of one disk.frame zip.df[[1]] diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 5df92161..b50973ee 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -171,6 +171,9 @@ /reference/evalparseglue.html + + /reference/find_globals_recursively.html + /reference/foverlaps.disk.frame.html @@ -183,6 +186,9 @@ /reference/get_chunk_ids.html + + /reference/get_partition_paths.html + /reference/groups.disk.frame.html @@ -240,6 +246,9 @@ /reference/overwrite_check.html + + /reference/partition_filter.html + /reference/play.html @@ -282,6 +291,9 @@ /reference/show_ceremony.html + + /reference/split_string_into_df.html + /reference/srckeep.html diff --git a/man/disk.frame_to_parquet.Rd b/man/disk.frame_to_parquet.Rd new file mode 100644 index 00000000..559a9a7f --- /dev/null +++ b/man/disk.frame_to_parquet.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/disk.frame-to-parquet.r +\name{disk.frame_to_parquet} +\alias{disk.frame_to_parquet} +\title{A function to convert a disk.frame to parquet format} +\usage{ +disk.frame_to_parquet(df, outdir) +} +\arguments{ +\item{df}{a disk.frame or a path to a disk.frame} + +\item{outdir}{the path to save the parquet files} +} +\description{ +A function to convert a disk.frame to parquet format +} diff --git a/man/find_globals_recursively.Rd b/man/find_globals_recursively.Rd new file mode 100644 index 00000000..2d7bc7fb --- /dev/null +++ b/man/find_globals_recursively.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
documentation in R/util.r +\name{find_globals_recursively} +\alias{find_globals_recursively} +\title{Find globals in an expression by searching through the chain} +\usage{ +find_globals_recursively(code, envir) +} +\arguments{ +\item{code}{An expression to search for globals} + +\item{envir}{The environment from which to begin the search} +} +\description{ +Find globals in an expression by searching through the chain +} diff --git a/man/get_chunk.Rd b/man/get_chunk.Rd index a4b4b796..86463463 100644 --- a/man/get_chunk.Rd +++ b/man/get_chunk.Rd @@ -7,7 +7,7 @@ \usage{ get_chunk(...) -\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ...) +\method{get_chunk}{disk.frame}(df, n, keep = NULL, full.names = FALSE, ..., partitioned_info = NULL) } \arguments{ \item{...}{passed to fst::read_fst or whichever read function is used in the backend} @@ -19,6 +19,8 @@ get_chunk(...) \item{keep}{the columns to keep} \item{full.names}{whether n is the full path to the chunks or just a relative path file name. Ignored if n is numeric} + +\item{partitioned_info}{for internal use only. 
It's a data.frame used to help with filtering by partitions} } \description{ Obtain one chunk by chunk id diff --git a/man/get_partition_paths.Rd b/man/get_partition_paths.Rd new file mode 100644 index 00000000..33bb5d39 --- /dev/null +++ b/man/get_partition_paths.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_partition.r +\name{get_partition_paths} +\alias{get_partition_paths} +\title{Get the partitioning structure of a folder} +\usage{ +get_partition_paths(df) +} +\arguments{ +\item{df}{a disk.frame whose paths will be used to determine if it's +folder-partitioned disk.frame} +} +\description{ +Get the partitioning structure of a folder +} diff --git a/man/partition_filter.Rd b/man/partition_filter.Rd new file mode 100644 index 00000000..14a93a4e --- /dev/null +++ b/man/partition_filter.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/partition-filter.r +\name{partition_filter} +\alias{partition_filter} +\title{Filter the dataset based on folder partitions} +\usage{ +partition_filter(x, ...) 
+} +\arguments{ +\item{x}{a disk.frame} + +\item{...}{filtering conditions for filtering the disk.frame at (folder) partition level} +} +\description{ +Filter the dataset based on folder partitions +} diff --git a/man/split_string_into_df.Rd b/man/split_string_into_df.Rd new file mode 100644 index 00000000..c18f63c5 --- /dev/null +++ b/man/split_string_into_df.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_partition.r +\name{split_string_into_df} +\alias{split_string_into_df} +\title{Turn a string of the form /partion1=val/partion2=val2 into data.frame} +\usage{ +split_string_into_df(path_strs) +} +\arguments{ +\item{path_strs}{The paths in string form to break into partition format} +} +\description{ +Turn a string of the form /partion1=val/partion2=val2 into data.frame +} diff --git a/man/write_disk.frame.Rd b/man/write_disk.frame.Rd index 60c2bf09..c152c851 100644 --- a/man/write_disk.frame.Rd +++ b/man/write_disk.frame.Rd @@ -12,10 +12,8 @@ write_disk.frame( recommend_nchunks(diskf)), overwrite = FALSE, shardby = NULL, + partitionby = NULL, compress = 50, - shardby_function = "hash", - sort_splits = NULL, - desc_vars = NULL, ... ) @@ -32,13 +30,9 @@ output_disk.frame(...) 
\item{shardby}{the columns to shard by} -\item{compress}{compression ratio for fst files} - -\item{shardby_function}{splitting of chunks: "hash" for hash function or "sort" for semi-sorted chunks} +\item{partitionby}{the columns to (folder) partition by} -\item{sort_splits}{for the "sort" shardby function, a dataframe with the split values.} - -\item{desc_vars}{for the "sort" shardby function, the variables to sort descending.} +\item{compress}{compression ratio for fst files} \item{...}{passed to cmap.disk.frame} } diff --git a/manuscript/Book.txt b/manuscript/Book.txt deleted file mode 100644 index 08609b2e..00000000 --- a/manuscript/Book.txt +++ /dev/null @@ -1,2 +0,0 @@ -taming-larger-than-ram-data-course.txt - diff --git a/manuscript/taming-larger-than-ram-data-course.txt b/manuscript/taming-larger-than-ram-data-course.txt deleted file mode 100644 index 864097c7..00000000 --- a/manuscript/taming-larger-than-ram-data-course.txt +++ /dev/null @@ -1,2 +0,0 @@ -# Introduction -This course will teach you all the ins and outs of {disk.frame}. The instructor is the author of {disk.frame}. Enjoy! diff --git a/presentation/.gitignore b/presentation/.gitignore deleted file mode 100644 index 0586a46d..00000000 --- a/presentation/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.pptx -*.pdf diff --git a/presentation/rstudio conf 2021/1min video script b/presentation/rstudio conf 2021/1min video script deleted file mode 100644 index 53337167..00000000 --- a/presentation/rstudio conf 2021/1min video script +++ /dev/null @@ -1,11 +0,0 @@ -Hi my name is ZedJ and I am a Data Scientist local to Melbourne. I am a keen contributor to open source data science projects, one of which I want to talk about at rstudio:conf 2021. That project is {disk.frame} - a larger-than-RAM data manipulation package. - -R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. 
- -{disk.frame} solves this issue by providing a 100%-R framework to manipulate data on disk. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. - -Also, {disk.frame} uses {dplyr} verbs to manipulate data so useRs will find it very easy to pick up. - -Finally, because {disk.frame} is 100%-R, you can use any R package with it at no extra cost unlike Spark. - -The talk I propose will introduce {disk.frame} to users with the needs to manipulate large amounts of data with minimal setup. They will find {disk.frame} very familiar, as {disk.frame} uses {dplyr} verbs directly;. Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large datasets. They will find lots of benefits in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. Because {disk.frame} can run R functions natively, they will find that {disk.frame} allows them to many R packages directly with {disk.frame}. diff --git a/presentation/rstudio conf 2021/Abstract proposal.md b/presentation/rstudio conf 2021/Abstract proposal.md deleted file mode 100644 index e7406d3f..00000000 --- a/presentation/rstudio conf 2021/Abstract proposal.md +++ /dev/null @@ -1,5 +0,0 @@ -Learn how to handle 100GBs of data with ease using {disk.frame} - the larger-than-RAM-data manipulation package. - -R loads data in its entirety into RAM. However, RAM is a precious resource and often do run out. That's why most R user would have run into the "cannot allocate vector of size xxB." error at some point. - -However, the need to handle larger-than-RAM data doesn't go away just because RAM isn't large enough. So many useRs turn to big data tools like Spark for the task. In this talk, I will make the case that {disk.frame} is sufficient and often preferable for manipulating larger-than-RAM data that fit on disk. I will show how you can apply familiar {dplyr}-verbs to manipulate larger-than-RAM data with {disk.frame}. 
\ No newline at end of file diff --git a/presentation/twin cities/.gitignore b/presentation/twin cities/.gitignore deleted file mode 100644 index 26416673..00000000 --- a/presentation/twin cities/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.mp4 diff --git a/presentation/user 2020/user-2020-tut-proposal.Rmd b/presentation/user 2020/user-2020-tut-proposal.Rmd deleted file mode 100644 index 83452169..00000000 --- a/presentation/user 2020/user-2020-tut-proposal.Rmd +++ /dev/null @@ -1,103 +0,0 @@ ---- -title: "Easy Larger-than-RAM data manipulation with disk.frame - useR! 2020 tutorial proposal" -author: "ZJ" -date: "11/25/2019" -output: pdf_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -## Title - -Handling 100GBs of data with ease - Larger-than-RAM data manipulation with disk.frame - -## Audience - -The tutorial aims to introduce {disk.frame} to users with the needs to manipulate large amounts of data. The audience will benefit the most from this tutorial if they are already familiar with popular R data frameworks such as dplyr and {data.table} because {disk.frame} uses dplyr and data.table primitives like `select`, and `dt[i, j, by]`. - -Some users rely on DBMS (e.g. PostgresSQL), Spark, or SAS to manage their large dataset. They will find tremendous benefit in switching to {disk.frame}, which will allow them to keep their workflow in R for as long as possible. - -## Instructor background - -I have more than ten years of experience using R and enterprise data manipulation tools like SAS and SQL. I have experience teaching Data Science courses in R in a group setting. I am the author of the {disk.frame} package, hence I have intimate knowledge of the inner workings of {disk.frame} and is, therefore, well-positioned to teach {disk.frame} for larger-than-RAM data manipulation. - - -The website I created for {disk.frame} is -* https://diskframe.com and - -I have given a useR! 
2019 talk on {disk.frame}, see -* https://www.youtube.com/watch?v=3XMTyi_H4q4&t=1s - -My LinkedIn -* https://www.linkedin.com/in/daizj/ - -Here is an R course I wrote to Data Science in general -* https://github.com/xiaodaigh/intro_r_data_science - -## Domain - -Big Data manipulation - efficiently wrangle data that doesn't fit into RAM - - -## Points of appeal (75 words) -Every R user would have run into the "cannot allocate vector of size xxxB." error at some point. For most applications, R needs to load the data in its entirety into RAM. However, RAM is a precious resource and often do run out. - -{disk.frame} solves this issue by providing a framework to manipulate data on disk and minimize RAM usage. By using {disk.frame}, the user can perform analysis on much larger data than is normally possible with R. A modern laptop with {disk.frame} can comfortably handle 100GB's of data. - - -## Learning objectives: 100 – 200 words -In this tutorial, the user will learn how to use {disk.frame} effectively to manipulate datasets up to 100GBs in size on a single computer. - -After the tutorial, the user should be able to - -* learn what the pros and cons of common "big data" technologies like DBMS, Spark, and {disk.frame} are -* understand how {disk.frame} can help with manipulating large amounts of data -* understand when to use {disk.frame} and when not to -* confidently use {disk.frame} for any data manipulation task and not worry about running out of RAM -* load large datasets into {disk.frame} format -* manipulate large {disk.frame} files -* summarize large {disk.frame} files - -## Computing requirement -The requirement for {disk.frame} is R 3.4 and the ability to install packages via `install.packages`. A laptop is advised but not necessary. - -If attendees wish to follow along, I recommend that they should have RStudio or Jupyter notebook set up. However, that is not necessary as I will running through code that they can obtain from Github. 
- -## Teaching Assistant -I don't think Teaching Assistants are necessary. Installing disk.frame is simply running `install.packages("disk.frame")`. - -However, anyone familiar with RStudio setup and installation of packages is welcome as Teaching Assistant. Any expertise in the installation and setting of Jupyter notebook or Jupyter Lab would be highly appreciated. - -## Lesson Plan -The structure will be very informal, and questions are encouraged at all points along the way. - -* Introduction to disk.frame; how it's structured and why it's fast -* Pros and cons of disk.frame; when to use disk.frame and when not to -* Follow along: -* generate a fake dataset that is reasonably large (so avoiding a large download from internet) -* Convert the dataset to CSV -* Show how to load the data from CSV to {disk.frame} -* Manipulate the {disk.frame} -* Do group-by and summary statistics with {disk.frame} -* show how dplyr and data.table syntax can operate on {disk.frame} -* Explain how to perform group-by efficiently -* Demonstrate how joining works for large {disk.frame}s - -**** break **** - -After the break, we will cover advanced {disk.frame} topics - -* Explain how to extend {disk.frame} with a custom function -* Explain how to tune {disk.frame} for maximum efficiency -* Show some convenience features for {disk.frame} in RStudio if time permits -* Conclusion and Q&A - -## Other Considerations -The user can consult https://diskframe.com. It is regularly updated with relevant {disk.frame} resources like articles and how-to-guides. - -The intention is to show how to manipulate larger-than-RAM data with {disk.frame}. However, we will not download any such data from the internet. Instead, we will use an existing function to generate such data. Hence, it will not pose any constraint on internet access nor speed. However, students wishing to follow along on laptops will need to install {disk.frame} via `install.packages("disk.frame")`. 
- - - diff --git a/tests/testthat/test-data-table.r b/tests/testthat/test-data-table.r index d1629450..fb58f324 100644 --- a/tests/testthat/test-data-table.r +++ b/tests/testthat/test-data-table.r @@ -45,10 +45,12 @@ test_that("data.table global vars", { } # Check function with data.table object - grep_storm_name(storms_dt, "^A") + a = grep_storm_name(storms_dt, "^A") # Check function with diskframe object - grep_storm_name(storms_df, "^A") + b = grep_storm_name(storms_df, "^A") + + expect_equal(a, b) }) teardown({ diff --git a/tests/testthat/test-dplyr-verbs.r b/tests/testthat/test-dplyr-verbs.r index 01314556..95ccd489 100644 --- a/tests/testthat/test-dplyr-verbs.r +++ b/tests/testthat/test-dplyr-verbs.r @@ -7,31 +7,31 @@ setup({ test_that("testing select", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - select(a) %>% + + df = b %>% + select(a) %>% collect - + expect_equal(ncol(df), 1) }) test_that("testing rename", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - rename(a_new_name = a) %>% + + df = b %>% + rename(a_new_name = a) %>% collect - + expect_setequal(colnames(df), c("a_new_name", "b")) }) test_that("testing filter", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - filter(a <= 100, b <= 10) %>% + + df = b %>% + filter(a <= 100, b <= 10) %>% collect - + expect_setequal(nrow(df), 10) }) @@ -44,109 +44,118 @@ test_that("testing filter - global vars", { filter(a <= one_hundred, b <= 10) %>% collect + expect_setequal(nrow(df), 10) + + + # testing_env = new.env() + # + # assign("one_hundred", 100, testing_env) + # assign("b", disk.frame(file.path(tempdir(), "tmp_b_dv.df")), testing_env) + # + # eval(parse(text=code), envir = testing_env, enclos=emptyenv()) }) test_that("testing mutate", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - - df = b %>% - mutate(d = a + b) %>% + + + df = b %>% + mutate(d = a + b) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) - - df = b %>% + + 
df = b %>% mutate(e = rank(desc(a))) %>% collect - + expect_equal(nrow(df), 100) - + # need to test value <- as.disk.frame(tibble(char = LETTERS, num = 1:26)) df2 = value %>% dplyr::mutate(b = case_when( char %in% c("A", "B", "C") ~ "1", - TRUE ~ char)) %>% + TRUE ~ char)) %>% collect - + expect_equal(ncol(df2), 3) - + # testing fn = function(a, b) { a+b } - + df3 = value %>% dplyr::mutate(b = fn(num, num)) %>% collect - + expect_equal(ncol(df3), 3) - - + + global_var = 100 - + df4 = value %>% dplyr::mutate(b = fn(num, num), d = global_var*2) %>% collect - + expect_equal(ncol(df4), 4) expect_true(all(df4$d == 200)) }) test_that("testing mutate user-defined function", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - + + udf = function(a1, b1) { a1 + b1 } - + df = b %>% mutate(d = udf(a,b)) %>% collect - + expect_setequal(sum(df$d), sum(df$a, df$b)) }) test_that("testing transmute", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - - df = b %>% - transmute(d = a + b) %>% + + df = b %>% + transmute(d = a + b) %>% collect - + expect_setequal(names(df), c("d")) }) test_that("testing arrange", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + expect_warning(df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% arrange(desc(random_unif))) - + df <- b %>% - mutate(random_unif = runif(dplyr::n())) %>% + mutate(random_unif = runif(dplyr::n())) %>% chunk_arrange(desc(random_unif)) - + x = purrr::map_lgl(1:nchunks(df), ~{ is.unsorted(.x) == FALSE }) - + expect_true(all(x)) }) test_that("testing chunk_summarise", { b = disk.frame(file.path(tempdir(), "tmp_b_dv.df")) - + df = b %>% - chunk_summarise(suma = sum(a)) %>% - collect %>% + chunk_summarise(suma = sum(a)) %>% + collect %>% summarise(suma = sum(suma)) - + expect_equal(df$suma, collect(b)$a %>% sum) }) @@ -154,21 +163,36 @@ test_that("testing mutate within function works", { test_f <- function(params, x_df){ x_df %>% mutate(aha = params[1]*cyl + 
params[2]*disp) } - + expect_true("aha" %in% names(test_f(c(1, 2), mtcars))) }) test_that("filter failure: prevent github #191 regression", { flights_df = as.disk.frame(nycflights13::flights) - + # expect error due to syntax error - expect_error(flights_df %>% - filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% + expect_error(flights_df %>% + filter(tailnum %in% paste0(unique(nycflights13::flights$tailnum)[1:60]), "") %>% collect) - + delete(flights_df) }) +test_that("testing {{}}", { + ok <- function(input_data, col) { + val = 2 + input_data %>% + mutate({{col}} + val, val2 = speed+dist) %>% + collect + } + + a = ok(as.disk.frame(cars), dist) + + b = ok(cars, dist) + + expect_equal(a, b) +}) + teardown({ fs::dir_delete(file.path(tempdir(), "tmp_b_dv.df")) diff --git a/tests/testthat/test-group-by.R b/tests/testthat/test-group-by.R index e61f75d3..e28b610b 100644 --- a/tests/testthat/test-group-by.R +++ b/tests/testthat/test-group-by.R @@ -299,6 +299,51 @@ test_that("tests for {{}}", { expect_equal(a, b) }) + +test_that("tests for global", { + # TODO make this work + val = 2 + val2 = 2 + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length)) %>% + collect + + expect_equal(b, a) + + b = iris %>% + as.disk.frame %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + a = iris %>% + group_by(as.integer(Species) + val, as.integer(Species) + val2) %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(a, b) + + a = iris %>% + summarize(mean(Petal.Length+val2)) + + b = iris %>% + as.disk.frame %>% + summarize(mean(Petal.Length+val2)) %>% + collect + + expect_equal(names(a), names(b)) + + expect_equal(a[[1]], b[[1]]) +}) + teardown({ # 
fs::file_delete(file.path(tempdir(), "tmp_pls_delete_gb.csv")) # fs::dir_delete(file.path(tempdir(), "tmp_pls_delete_gb.df")) diff --git a/tests/testthat/test-names.r b/tests/testthat/test-names.r index 2d114aeb..6cc65022 100644 --- a/tests/testthat/test-names.r +++ b/tests/testthat/test-names.r @@ -12,13 +12,13 @@ test_that("testing names", { expect_setequal(names(b), c("a","b")) }) -test_that("testing names with lazyfn", { - b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% - mutate(d = a + b) - - expect_setequal(colnames(b), c("a","b", "d")) - expect_setequal(names(b), c("a","b", "d")) -}) +# test_that("testing names with lazyfn", { +# b = disk.frame(file.path(tempdir(), "tmp_names.df")) %>% +# mutate(d = a + b) +# +# expect_setequal(colnames(b), c("a","b", "d")) +# expect_setequal(names(b), c("a","b", "d")) +# }) teardown({ fs::dir_delete(file.path(tempdir(), "tmp_names.df")) diff --git a/tests/testthat/test-partitions.r b/tests/testthat/test-partitions.r new file mode 100644 index 00000000..3fb65f29 --- /dev/null +++ b/tests/testthat/test-partitions.r @@ -0,0 +1,24 @@ +context("test-partitions") + +setup({ + setup_disk.frame(workers = 1) +}) + +test_that("test partitions", { + a = as.disk.frame(cars) %>% + write_disk.frame(partitionby="speed", outdir=tempfile()) + + a = a %>% + partition_filter(speed < 10) %>% + collect %>% + arrange(speed, dist) + + + b = cars %>% + filter(speed < 10) %>% + arrange(speed, dist) + + for(n in names(a)) { + expect_equal(a %>% select(!!n), b %>% select(!!n)) + } +}) diff --git a/tutorials/0_feature_tools_drake.r b/tutorials/0_feature_tools_drake.r deleted file mode 100644 index 2ad73881..00000000 --- a/tutorials/0_feature_tools_drake.r +++ /dev/null @@ -1,20 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -library(drake) - -plan = drake::drake_plan( - df = target( - disk.frame::csv_to_disk.frame( - file_in("D:/data/mortgage-risk/feature_matrix_cleaned.csv"), - outdir = drake::drake_tempfile() - ), - format = "diskframe" 
- ) -) - -system.time(make(plan)) - -loadd(df) - -df diff --git a/tutorials/1_feature_tools.r b/tutorials/1_feature_tools.r deleted file mode 100644 index e5bd3282..00000000 --- a/tutorials/1_feature_tools.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - - -system.time( - a <- - csv_to_disk.frame( - "c:/data/feature_matrix_cleaned.csv", - in_chunk_size = 1e5/4 - ) -) - -system.time(rechunk(a, 16, outdir = "c:/data/ft.df", shardby="sk_id_curr")) diff --git a/tutorials/2_feature_tools.r b/tutorials/2_feature_tools.r deleted file mode 100644 index 88b3c081..00000000 --- a/tutorials/2_feature_tools.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -loadd(gd) - -system.time(corgd = cor(gd)) - -library(data.table) - -system.time( - a <- - data.table::fread( - "C:/Users/ZJ.DAI/Documents/git/disk.frame/tutorials/gd.csv")) - - -if(nchunks(df) != 32) { - rechunk(df, 32, shardby= "sk_id_curr") -} - -# rechunk(df, 32) -plan <- drake::drake_plan( - df = disk.frame(file_in("d:/data/ft.df")), - df = { - if(nchunks(df) != 32) { - rechunk(df, 32, shardby= "sk_id_curr") - } - }, - bads = df %>% - filter(target == 1) %>% - collect(parallel=FALSE) -) - - - -system.time() - -sfrac = nrow(bads)/(nrow(df) - nrow(bads)) - -# sfrac = 0.07490269 -system.time(df1 <- df %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE)) - -goods = df1 - -gd = rbindlist(list(bads, goods)) -gd - - -pryr::object_size(gd) -arrow::write_parquet(gd, "gd.parquet") -fst::write_fst(gd, "gd.fst") -data.table::fwrite(gd, "gd.csv") - -sample_frac(gd, 0.25) %>% - 
data.table::fwrite("gd.sample.csv") - -gd = fst::read_fst("gd.fst") -library(DataExplorer) -DataExplorer::create_report(gd) - diff --git a/tutorials/2_feature_tools_drake.r b/tutorials/2_feature_tools_drake.r deleted file mode 100644 index a8cb864c..00000000 --- a/tutorials/2_feature_tools_drake.r +++ /dev/null @@ -1,63 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv"))#, - # intro_plot = DataExplorer::create_report( - # gd, - # output_file = file_out("output/dataexplorer/plot_intro.html") - # ) -) - -make(plan) - -loadd(gd) - -intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") -) - -build_times(plan) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/2_feature_tools_drake.rmd b/tutorials/2_feature_tools_drake.rmd deleted file mode 100644 index cdbdc84c..00000000 --- a/tutorials/2_feature_tools_drake.rmd +++ /dev/null @@ -1,60 +0,0 @@ -```{r} -library(disk.frame) -library(data.table) 
-library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -```{r} -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")), - intro_plot = DataExplorer::create_report( - gd, - output_file = file_out("output/dataexplorer/plot_intro.html") - ) -) -system.time(make(plan)) -``` - -```{r} -print(getwd()) - -``` - - -```{r} -loadd(gd) -gd -``` - -``{r} -build_times(plan) -``` diff --git a/tutorials/2_feature_tools_no_plo.r b/tutorials/2_feature_tools_no_plo.r deleted file mode 100644 index a96011cb..00000000 --- a/tutorials/2_feature_tools_no_plo.r +++ /dev/null @@ -1,52 +0,0 @@ -library(disk.frame) -library(data.table) -library(drake) -library(DataExplorer) -# this willl set disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) - -# rechunk(df, 32) -plan <- drake::drake_plan( - df1 = { - df1 = disk.frame(file_in("d:/data/ft.df")) - if(nchunks(df1) != 32) { - rechunk(df1, 32, shardby= "sk_id_curr") - } - df1 - }, - bads = df1 %>% - filter(target == 1) %>% - collect(parallel=FALSE), - sfrac = nrow(bads)/(nrow(df1) - nrow(bads)), - goods = - df1 %>% - filter(target == 0) %>% - 
sample_frac(size = sfrac) %>% - collect(parallel = FALSE), - gd = target( - rbindlist(list(bads, goods)), - format = "fst" - ), - arrow::write_parquet(gd, file_out("gd.parquet")), - data.table::fwrite(gd, file_out("gd.csv")), - gd_sample = sample_frac(gd, 0.25), - data.table::fwrite(gd_sample, file_out("gd.sample.csv")) -) - -make(plan) - -loadd(gd) -# -# profile_missing(gd) ->pmgd -# -# plot_histogram(pmgd) -# -# -# intro_plot = DataExplorer::create_report( -# gd, -# configure_cache() -# ) -# -# introduce(gd) diff --git a/tutorials/30-g.R b/tutorials/30-g.R deleted file mode 100644 index 7b5a9ee7..00000000 --- a/tutorials/30-g.R +++ /dev/null @@ -1,49 +0,0 @@ -library(disk.frame) - -setup_disk.frame(12) - -a = disk.frame("c:/data/airontimecsv.df/") - -system.time(b <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum(DEP_DELAY)) %>% - collect) - - -path_to_data <- "c:/data/" -rows = 148619656 -recommended_nchunks = recommend_nchunks(file.size(file.path(path_to_data, "combined.csv"))) -in_chunk_size = ceiling(rows/ recommended_nchunks) - -path_to_data = "c:/data/AirOnTimeCSV/" -#path_to_data = "d:/data/" -system.time(a <- csv_to_disk.frame( - list.files(path_to_data, pattern = ".csv$", full.names = TRUE), - outdir = file.path("c:/data/", "airontimecsv.df"), - colClasses = list(character = c("WHEELS_OFF", "WHEELS_ON")) -)) - - -system.time(flights.df <- csv_to_disk.frame( - paste0(path_to_data, "combined.csv"), - outdir = paste0(path_to_data, "combined.laf.df"), - in_chunk_size = in_chunk_size, - backend = "LaF" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readr.df"), - in_chunk_size = in_chunk_size, - colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readr" -)) - -system.time(a <- csv_to_disk.frame( - file.path(path_to_data, "combined.csv"), - outdir = file.path(path_to_data, "combined.readLines.df"), - in_chunk_size = in_chunk_size, - 
colClasses = list(character = c("WHEELS_OFF","WHEELS_ON")), - chunk_reader = "readLines" -)) - diff --git a/tutorials/bigreadr.R b/tutorials/bigreadr.R deleted file mode 100644 index c2deac1e..00000000 --- a/tutorials/bigreadr.R +++ /dev/null @@ -1,59 +0,0 @@ -library(disk.frame) - -setup_disk.frame() - -system.time(b <- csv_to_disk.frame( - "c:/data/combined.csv", - colClasses = list(character = c("WHEELS_ON", "WHEELS_OFF")), - in_chunk_size = 4e6, - chunk_reader = "bigreadr" -)) - -system.time(nl <- bigreadr::nlines("c:/data/combined.csv")) -system.time(a <- bigreadr::split_file( - "c:/data/combined.csv", - every_nlines = 4e6, - repeat_header = TRUE)) - -system.time(b <- csv_to_disk.frame( - bigreadr::get_split_files(a), - header = TRUE, - colClasses = list(character = c("WHEELS_ON", "WHEELS_OFF")) -)) - - - -#' system(' -#' @echo off -#' Setlocal EnableDelayedExpansion -#' for /f "usebackq" %%a in (`dir /b %1`) do ( -#' for /f "usebackq" %%b in (`type %%a ^| find "" /v /c`) do ( -#' set /a lines += %%b -#' ) -#' ) -#' echo %lines% -#' endlocal -#' -#' CountLines "c:\data\combined.csv" -#' ') - -a = disk.frame(file.path(path_to_data, "airontimecsv.df")) - - -system.time(r_mean_del_delay <- a %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(sum_delay = sum(DEP_DELAY, na.rm = TRUE), n = n()) %>% - collect %>% - group_by(YEAR, MONTH, DAY_OF_MONTH) %>% - summarise(mean_delay = sum(sum_delay)/sum(n))) - - -library(lubridate) - -dep_delay = r_mean_del_delay %>% - arrange(YEAR, MONTH, DAY_OF_MONTH) %>% - mutate(date = ymd(paste(YEAR, MONTH, DAY_OF_MONTH, sep = "-"))) - -library(ggplot2) -ggplot(dep_delay, aes(date, mean_delay)) + geom_smooth() - diff --git a/tutorials/cran-logs.r b/tutorials/cran-logs.r deleted file mode 100644 index 3628e513..00000000 --- a/tutorials/cran-logs.r +++ /dev/null @@ -1,78 +0,0 @@ -a = data.table::fread("c:/Users/ZJ.DAI/Downloads/a (1).csv") - -cran.stats::read_logs(start=as.Date("2019-08-27"), as.Date("2019-08-27"), dir = 
"d:/data/cran-logs") - -library(dplyr) -library(data.table) -a[,date:=as.Date(date, "%Y-%m-%d")] - -a1 = a %>% - # filter(N < 100) %>% - right_join(data.frame(date = seq(min(a$date), max(a$date), by="day"))) - -setDT(a1) -a1[,N := ifelse(is.na(N), (lag(N) + lead(N))/2, N)] -a2 = a1[!is.na(N)] -fwrite(a2, "c:/data/a1.csv", row.names = FALSE) - -a3 = cbind(a2, a2[,shift(N,-3:3, type = "lead")] %>% - apply(1,mean)) %>% - filter(!is.na(V2)) %>% - select(-N) -fwrite(a3, "c:/data/a3.csv", row.names = FALSE) - -plot(a3) - -library(pixapi) - -result = pixapi::get_pix_forecasts("https://ep-d5adc7ba-0218-44f2-9ed9-eb6d202784e8.serving.aiaengine.com/invocations", 14) -setDT(result) -a4 = rbind(a3, result[,.(date = as.Date(date), V2)]) -plot(a4) - -a4 %>% - full_join(a1) -> a5 - -write.csv(a5, "a5.csv") - - -rs = cran.stats::read_logs(start = as.Date("2019-09-26"), as.Date("2019-09-30"), dir = "cran-logs") - -library(drake) -library(disk.frame) -setup_disk.frame() - - - -plan <- drake_plan( - lf1 = list.files( - file_in("d:/data/cran-logs"), - pattern="*.csv", - full.names = T - ), - x = target( - disk.frame::csv_to_disk.frame( - lf1, - outdir = drake_tempfile()), - format = "diskframe" - ) -) - -make(plan) - -config <- drake_config(plan) -vis_drake_graph(config) - -x = readd(lf1) - -head(x) - -readd(x) -system.time( - df <- csv_to_disk.frame(list.files("d:/data/cran-logs", pattern="*.csv", full.names = T)) -) - -head(df) - -df = shard(df, shardby = "date") -df[package=="disk.frame",.N, date, keep=c("package","date")][order(date)] diff --git a/tutorials/fannie-mae-to-parquet/0-setup.r b/tutorials/fannie-mae-to-parquet/0-setup.r deleted file mode 100644 index dc5fd196..00000000 --- a/tutorials/fannie-mae-to-parquet/0-setup.r +++ /dev/null @@ -1,15 +0,0 @@ -library(disk.frame) -setup_disk.frame() - - -fm.df = disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - -fm.df %>% - mutate( - yr = substr(monthly.rpt.prd, 7, 11) %>% as.numeric, - mth = substr(monthly.rpt.prd, 1, 2) %>% 
as.numeric) %>% - rechunk(nchunks = nchunks(fm.df), outdir = "c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/", shardby = c("yr", "mth")) - - - - diff --git a/tutorials/fannie-mae-to-parquet/1-testing.r b/tutorials/fannie-mae-to-parquet/1-testing.r deleted file mode 100644 index 99008220..00000000 --- a/tutorials/fannie-mae-to-parquet/1-testing.r +++ /dev/null @@ -1,7 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -df1 = disk.frame("c:/data/fannie_mae_disk_frame/fm_by_yr_mth.df/") -system.time(a <- df1[,.N, .(yr, mth), keep=c("yr", "mth")]) - -a diff --git a/tutorials/fannie_mae/00_setup.r b/tutorials/fannie_mae/00_setup.r deleted file mode 100644 index 1fb5ffcd..00000000 --- a/tutorials/fannie_mae/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", 
"character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae/01a_read_from_csv.r b/tutorials/fannie_mae/01a_read_from_csv.r deleted file mode 100644 index e97b7c65..00000000 --- a/tutorials/fannie_mae/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but 
slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r b/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r deleted file mode 100644 index be535fed..00000000 --- a/tutorials/fannie_mae/01a_read_from_csv_SINGLE_THREADED.r +++ /dev/null @@ -1,37 +0,0 @@ -source("inst/fannie_mae/0_setup.r") - -# number of rows to read in -compress = 50 - -# how many chunks do we need? 
-relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -pt <- proc.time() -system.time(lapply(1:l, function(i) { -#system.time(future_lapply(1:6, function(i) { - relative_file_pathi = relative_file_path[i] - full_file_path - csv_to_disk.frame( - full_file_path[i], - glue("test_fm/{relative_file_path[i]}"), - shardby="loan_id", - nchunks=nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|", - compress=compress, - overwrite = T) -})) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r deleted file mode 100644 index 1ee3669c..00000000 --- a/tutorials/fannie_mae/01d_OPTIONAL_rechunk.r +++ /dev/null @@ -1,5 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame("fmdf") - -rechunk(fmdf, 128) diff --git a/tutorials/fannie_mae/01e_read_data_for_harp.r b/tutorials/fannie_mae/01e_read_data_for_harp.r deleted file mode 100644 index b9f1173c..00000000 --- a/tutorials/fannie_mae/01e_read_data_for_harp.r +++ /dev/null @@ -1,85 +0,0 @@ -#1c2 -source("tutorials/fannie_mae/00_setup.r") - -pt <- proc.time() - -# rows to read in one go -rows_to_read = 1e7 - -# load the Fannie Mae disk.frame -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = Performance_Variables)) -harp_mapping <- fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", 
"harp_id")) -setkey(harp_mapping, harp_id) -fst::write_fst(harp_mapping,"harp_mapping.fst.tmp") -fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst") -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - -# took about 438.65 on laptop 500 chunks -# took about 205 on desktop 56 chunks -pt <- proc.time() -system.time( - harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) { - setnames(df, "loan_id", "harp_id") - - merge(df, harp_mapping, by="harp_id") - }, - nchunks = nchunks(fmdf), - in_chunk_size = rows_to_read, - shardby = "loan_id", - outdir = file.path(outpath, "harp.df"), - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|")) - -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - - -if(F) { - # it can be seen that some accounts can start n harp the month the same that it ends in the dataset - harp1 = get_chunk.disk.frame(harp,1) - fmdf1 = get_chunk.disk.frame(fmdf,1) - - harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - - fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)] - - fmdf3 = merge( - fmdf2[,.(max_date = max(date)),loan_id], - harp1[,.(min_date = min(date)),loan_id], - by = "loan_id" - ) - - fmdf3[max_date == min_date,] - - fmdf3[max_date > min_date,] - fmdf3[max_date <= min_date,] - - harp4 = harp[,.N, delq.status, keep="delq.status"] -} - - -# print(nrow(harp)) -# system.time(harp <- merge(harp, harp_mapping, by="harp_id")) -# print(nrow(harp)) -# setkey(harp,"loan_id") -# system.time(fst::write_fst(harp,"harp.fst")) # 16.63 -#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds - -# check if the harp_id -> loan_id is successful -if(F) { - harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt") - fst::write_fst(harp_mapping,"harp_mapping.fst") - - harp[,date:=as.Date(month,"%m/%d/%Y")] - harp[,.N,date][order(date)] - 
fmdf = disk.frame("fmdf") - system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - harp_uid = unique(harp_mapped$loan_id) - def = intersect(uid, harp_uid) - def -} \ No newline at end of file diff --git a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r b/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r deleted file mode 100644 index 3d344ac8..00000000 --- a/tutorials/fannie_mae/01f_append_harp_to_data_DEPRECATED.r +++ /dev/null @@ -1,40 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -# system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -# setnames(harp, names(harp), c("harp_id","month")) -# -# harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp_mapped = merge(harp, harp_mapping, by="harp_id") -# c(nrow(harp), nrow(harp_mapped)) -# -# harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -# fst::write_fst(harp_mapping,"harp_mapping.fst") -# -# harp[,date:=as.Date(month,"%m/%d/%Y")] -# harp[,.N,date][order(date)] - -system.time(harp <- fst::read_fst("harp.fst")) - -fmdf = disk.frame("fmdf") - -if(F) { - # this was run once to confimr that the -} -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id",500, "harp.df", overwrite = T, append = F)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program - - - diff --git a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r b/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r deleted file mode 100644 index 937bd874..00000000 --- 
a/tutorials/fannie_mae/01g_compress_it_DEPRECATED.r +++ /dev/null @@ -1,10 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -# create a most compressed set -a = disk.frame("fmdf") -chunk_lapply(a,function(x) x, outdir="fmdf0", chunks=500, compress=100) - -a1 = disk.frame("fmdf0") - -system.time(print(a[,.N,keep="delq.status"] %>% sum)) -system.time(print(a1[,.N,keep="delq.status"] %>% sum)) diff --git a/tutorials/fannie_mae/02_exploratory.r b/tutorials/fannie_mae/02_exploratory.r deleted file mode 100644 index 2a362d81..00000000 --- a/tutorials/fannie_mae/02_exploratory.r +++ /dev/null @@ -1,19 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame("fmdf0") - -system.time(a <- fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status]) -setkey(a,delq.status); a - -str(a) - -str(head(fmdf)) - -b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b - - -names(fmdf) -servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")] - -servicer.name_cnt[,sum(N),servicer.name] diff --git a/tutorials/fannie_mae/02a_create_forward_looking_flag.r b/tutorials/fannie_mae/02a_create_forward_looking_flag.r deleted file mode 100644 index 2d7dc6bc..00000000 --- a/tutorials/fannie_mae/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,97 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -harp = disk.frame(file.path(outpath, "harp.df")) - -# harp contains the first date on which a loan enters into HARP -pt <- proc.time() -harp1 <- harp %>% - srckeep(c("loan_id","monthly.rpt.prd")) %>% - group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine - summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>% - collect(parallel = T) # performs the collection in parallel - -cat(glue("Creating first harp date took: {timetaken(pt)}")) - -if(F) { - # data.table syntax - harp[,.(first_harp_date = 
min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))), loan_id, - keep=c("loan_id", "monthly.rpt.prd")] -} - -harp1[,before_12m_first_harp_date:=first_harp_date] -# 164 seconds -system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12) -harp1[,harp_12m := TRUE] - -# break it out into smaller chunks for even faster merging -# took 27 -pt <- proc.time() -harp2 <- shard( - harp1, - "loan_id", - nchunks = nchunks(fmdf), - outdir = file.path(outpath, "first_harp_date.df"), - overwrite=T) -cat(glue::glue("sharding HARP defaults took {timetaken(pt)}")) - -# no need to hard group by it's already sharded by loan_id -# took about 503 seconds -# took about 11:52 -pt <- proc.time() -defaults <- cmap(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -# -# a = df[loan_id == "100513171914", ] -# -# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# a[,monthly.rpt.prd2:=monthly.rpt.prd] -# -# -# setkey(df3,"loan_id","start_date","end_date") -# -# a1= 
foverlaps(a, df3, by.x = c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"), -# by.y=c("loan_id","start_date","end_date")) -# -# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)] - -# create a list that show defaults ---------------------------------------- -# system.time(defaults <- chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae/02b_foverlaps.r b/tutorials/fannie_mae/02b_foverlaps.r deleted file mode 100644 index a89d107a..00000000 --- a/tutorials/fannie_mae/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - 
overwrite = T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d4..00000000 --- a/tutorials/fannie_mae/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap := pmin(5, as.integer(delq.status))] -tmp5[delq.status == "X", delq.status.cap := 0] -tmp5[delq.status == "", delq.status.cap := 0] - - -tmp6 <- tmp5[,.(ndef=sum(ndef), 
N=sum(N)),.(monthly.rpt.prd, delq.status.cap)] - - -tmp6[,defr:=ndef/N] - -library(ggplot2) -tmp6 %>% - ggplot + - geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap))) - - diff --git a/tutorials/fannie_mae/02c_plot_odr.r b/tutorials/fannie_mae/02c_plot_odr.r deleted file mode 100644 index 4521511a..00000000 --- a/tutorials/fannie_mae/02c_plot_odr.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T), - n_goto_harp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N), - oh = sum(n_goto_harp)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr, - `HARP Conversion Rate%` = oh - )) - - -if(F) { - # data.table syntax - system.time(a_wh <- tmp4[ - ,.(.N, n_defaults = sum(default_12m, na.rm = T), ) - ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")]) - - a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd] - setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%")) -} - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae/02d_harp_check.r b/tutorials/fannie_mae/02d_harp_check.r deleted file mode 100644 index f9fd020a..00000000 --- a/tutorials/fannie_mae/02d_harp_check.r +++ /dev/null @@ -1,33 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") - -system.time(harp <- 
fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -setnames(harp,names(harp), c("harp_id","month")) - -harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp_mapped = merge(harp, harp_mapping, by="harp_id") -c(nrow(harp), nrow(harp_mapped)) - -harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp[,date:=as.Date(month,"%m/%d/%Y")] -harp[,.N,date][order(date)] - -fmdf = disk.frame("fmdf") - -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program \ No newline at end of file diff --git a/tutorials/fannie_mae/10_a_one_var.R b/tutorials/fannie_mae/10_a_one_var.R deleted file mode 100644 index fae7540c..00000000 --- a/tutorials/fannie_mae/10_a_one_var.R +++ /dev/null @@ -1,191 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - 
monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost 
-dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, 
-# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae/10_a_one_var_with_fn.R b/tutorials/fannie_mae/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc7..00000000 --- a/tutorials/fannie_mae/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - 
"default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10_b_two_vars.r b/tutorials/fannie_mae/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - 
monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index bd708ed5..00000000 --- a/tutorials/fannie_mae/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", 
"orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/10b_make_scorecard_visible.R b/tutorials/fannie_mae/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = 
evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - 
-scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/11_speedglm.R b/tutorials/fannie_mae/11_speedglm.R deleted file mode 100644 index 17b38269..00000000 --- a/tutorials/fannie_mae/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae/12_keras.r b/tutorials/fannie_mae/12_keras.r deleted file mode 100644 index 732f02a3..00000000 --- a/tutorials/fannie_mae/12_keras.r +++ 
/dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - 
#,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae/13_disk.r b/tutorials/fannie_mae/13_disk.r deleted file mode 100644 index 10adc926..00000000 --- a/tutorials/fannie_mae/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf 
= disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae/3a_get_some_data.r b/tutorials/fannie_mae/3a_get_some_data.r deleted file mode 100644 index b86128a8..00000000 --- a/tutorials/fannie_mae/3a_get_some_data.r +++ /dev/null @@ -1,113 +0,0 @@ -library(disk.frame) - -source("inst/fannie_mae/0_setup.r") - -a = disk.frame(file.path(outpath, "fm_with_harp")) - -pt = proc.time() -a1 <- a %>% - srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>% - delayed(~{ - setkey(.x, loan_id) - uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01) - .x[uid,] -}) %>% collect(parallel = T) -timetaken(pt) - -a1[delq.status != "X",delq.statusn := as.numeric(delq.status)] -a1[delq.status %in% c("","X"),delq.statusn := 0] - -#a1[, delq.statusn := pmin(delq.statusn, 3)] - -a1[,.N,delq.statusn] - -a1[is.na(delq.statusn)] - -# remove those that have already defulat -a2 = a1 - -a2[is.na(default_12m), default_12m := FALSE] - -a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)] -setkey(a3, loan_id, monthly.rpt.prd) - -a3[,sum(default_12m)/.N,delq.statusn] - -# create worst delq status last 12 months -system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id]) - -eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, na.rm=T)]"))) - -a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)]) - -a6 = a5#[delq.statusn < 3,] - -a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)] 
- -a6[,.N, worst_delq_last_12m_capped] - -a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7) - -a6dev = a6[a6devid] -a6val = a6[!a6devid] - -a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped] - -a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped") -a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped") - -setkey(a6val, loan_id, monthly.rpt.prd) -setkey(a6dev, loan_id, monthly.rpt.prd) - -for_write <- function(a6val) { - a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ] - a6val1[,N:=.N,loan_id] - - a6val2 = a6val1[N == 12,] - setkey(a6val2, loan_id, monthly.rpt.prd) - - toremove = a6val2[,.SD[12,delq.statusn], loan_id] - - toremove1 = toremove[V1 >= 3, loan_id] - - a6val2 = a6val2[!(loan_id %in% toremove1), ] - - a6val2[order(loan_id), id := rleid(loan_id)] - setkey(a6val2, loan_id, monthly.rpt.prd) - a6val2 -} - - -a6val2 <- for_write(a6val) -a6dev2 <- for_write(a6dev) - - -a6dev2[,.N, delq.statusn] -a6val2[,.N, delq.statusn] - -fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv") -fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv") - - - -gini <- function(a6val) { - setkey(a6val, id, monthly.rpt.prd) - a6val = a6val[seq(12,.N, by=12),] - a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd] - - a6_auc[,score:=-pd] - setkey(a6_auc, score) - - a6_auc[order(score), height := bads/sum(bads)] - a6_auc[order(score), width := tots/sum(tots)] - plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l") - abline(0,1) - a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc) - a6_auc2[,cheight := cumsum(height)] - a6_auc2[,cwidth := cumsum(width)] - 2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1 -} - -gini(a6val2) -gini(a6dev2) - diff --git a/tutorials/fannie_mae/4a_read_appl.r b/tutorials/fannie_mae/4a_read_appl.r 
deleted file mode 100644 index 6edcd593..00000000 --- a/tutorials/fannie_mae/4a_read_appl.r +++ /dev/null @@ -1,57 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -fs::dir_delete(file.path(outpath, "acq.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae/4b_exploratory.R b/tutorials/fannie_mae/4b_exploratory.R 
deleted file mode 100644 index 1cf4382d..00000000 --- a/tutorials/fannie_mae/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default ------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - 
#filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae/4c_explore_xgboost.r b/tutorials/fannie_mae/4c_explore_xgboost.r deleted file mode 100644 index f6a53cba..00000000 --- a/tutorials/fannie_mae/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 = disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = 
data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae/4d_xgboost.r b/tutorials/fannie_mae/4d_xgboost.r deleted file mode 100644 index 248e676a..00000000 --- a/tutorials/fannie_mae/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor 
%>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - 
select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae/5a_appl_model.r b/tutorials/fannie_mae/5a_appl_model.r deleted file mode 100644 index 2c7d97e1..00000000 --- a/tutorials/fannie_mae/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - -defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae/5b_one_var.r b/tutorials/fannie_mae/5b_one_var.r deleted file mode 100644 index 
f95f221d..00000000 --- a/tutorials/fannie_mae/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb 
%>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred 
-monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae/5c_using_a_fn.r 
b/tutorials/fannie_mae/5c_using_a_fn.r deleted file mode 100644 index e083fa54..00000000 --- a/tutorials/fannie_mae/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae/5d_AutoML.r b/tutorials/fannie_mae/5d_AutoML.r deleted file mode 100644 index 7525c879..00000000 --- a/tutorials/fannie_mae/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - 
rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev = acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae/5e_AutoML_actual.r b/tutorials/fannie_mae/5e_AutoML_actual.r deleted file mode 100644 index 1e591543..00000000 --- a/tutorials/fannie_mae/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = 
features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae/5f_make_scorecard_visible.R b/tutorials/fannie_mae/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a29..00000000 --- a/tutorials/fannie_mae/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- 
function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) 
- -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae/6a_origination_year.r b/tutorials/fannie_mae/6a_origination_year.r deleted file mode 100644 index 8115a782..00000000 --- a/tutorials/fannie_mae/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae/7a_speedglm.r b/tutorials/fannie_mae/7a_speedglm.r deleted file mode 100644 index 974023bb..00000000 --- a/tutorials/fannie_mae/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = 
sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - -library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! 
make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae/8a_keras.r b/tutorials/fannie_mae/8a_keras.r deleted file mode 100644 index ef2e1486..00000000 --- a/tutorials/fannie_mae/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = 
sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/fannie_mae_10pct/00_setup.r b/tutorials/fannie_mae_10pct/00_setup.r deleted file mode 100644 
index 23df1718..00000000 --- a/tutorials/fannie_mae_10pct/00_setup.r +++ /dev/null @@ -1,44 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -auc = disk.frame:::auc -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -raw_perf_data_path = "C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", 
"character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/fannie_mae_10pct/01a_read_from_csv.r b/tutorials/fannie_mae_10pct/01a_read_from_csv.r deleted file mode 100644 index abfe3062..00000000 --- a/tutorials/fannie_mae_10pct/01a_read_from_csv.r +++ /dev/null @@ -1,47 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -# number of rows to read in from each file in one go -nreadin = 1e7 - -# compression ratio, max = 100 for best compression but slower running speed -#compress = 50 - -# set up some variable for future use -relative_file_path = dir(raw_perf_data_path) -full_file_path = dir(raw_perf_data_path, full.names = T) - -file_sizes = purrr::map_dbl(full_file_path, ~file.size(.x)) - -# use the recommend_nchunks function to get a chunksize based on your RAM and -# number of CPU cores -nchunks = sum(file_sizes) %>% recommend_nchunks(type="csv") - -# order the order of conversion by prioritising the largest files first. 
Because -# handling the largest files are the most difficult, and if an error occurs it -# is more likely to occur when converting large files, hence this will allow us -# to identify errors early -relative_file_path = relative_file_path[order(file_sizes, decreasing = T)] -full_file_path = full_file_path[order(file_sizes, decreasing = T)] - -l = length(full_file_path) - -# convert CSV in parallel -pt <- proc.time() -res = future.apply::future_mapply(function(relative_file_pathi, full_file_pathi) { -#res = mapply(function(relative_file_pathi, full_file_pathi) { - relative_file_pathi # for glue - df = csv_to_disk.frame( - full_file_pathi, - file.path(outpath, glue("raw_fannie_mae/{relative_file_pathi}")), - shardby = "loan_id", - nchunks = nchunks, - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - #compress = compress, - in_chunk_size = nreadin, - overwrite = T) - - df -}, relative_file_path, full_file_path, SIMPLIFY = F) -print(timetaken(pt)) diff --git a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r b/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r deleted file mode 100644 index 78ccf3e1..00000000 --- a/tutorials/fannie_mae_10pct/01d_a_rbind_all_data_together.r +++ /dev/null @@ -1,31 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") - -disk.frame_folders = dir(file.path(outpath, "raw_fannie_mae"),full.names = T) -list_of_disk.frames <- purrr::map(disk.frame_folders, disk.frame) - -pt <- proc.time() -fmdf = rbindlist.disk.frame( - list_of_disk.frames, - outdir = file.path(outpath, "fm.df"), - by_chunk_id = T, - overwrite = T) -print(timetaken(pt)) - -nrow(fmdf) - -nchunks(fmdf) - -fmdf %>% delayed(~.x[,.(sum(is.na(prin_forg_upb_oth )))] -) %>% dput - - -if(F) { - #239.44 - system.time(uid <- fmdf %>% - srckeep("loan_id") %>% - summarise(uid = unique(loan_id)) %>% - collect) - # the sharding is correct - - n_distinct(uid$uid) -} diff --git a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r 
b/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r deleted file mode 100644 index 0a0739fc..00000000 --- a/tutorials/fannie_mae_10pct/01d_b_OPTIONAL_rechunk.r +++ /dev/null @@ -1,16 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -# rechunk respects the shardkey -system.time(rechunk(fmdf, nchunks(fmdf)*2)) - -nrow(fmdf) - -nchunks(fmdf) - - -if(F) { - system.time(rechunk(fmdf, 6)) -} - diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r deleted file mode 100644 index 041b3334..00000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,65 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) - -pt = proc.time() -eg = fmdf %>% - srckeep(c("loan_id", "delq.status")) %>% - group_by(loan_id) %>% - summarise(n = n(), max.delq.status = max(as.integer(delq.status))) %>% - filter(n >= 12, max.delq.status == 3) %>% - delayed(~.x[1,]) %>% - collect -timetaken(pt) - -pt = proc.time() -fmdf_eg = fmdf %>% - srckeep(c("monthly.rpt.prd", "loan_id", "delq.status")) %>% - inner_join(eg, by = "loan_id") %>% - collect %>% - mutate(monthly.rpt.prd = as.Date(monthly.rpt.prd, "%m/%d/%Y")) %>% - arrange(loan_id, monthly.rpt.prd) -timetaken(pt) - -View(fmdf_eg) # see 2nd account - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr 
:= 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r b/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r deleted file mode 100644 index e5320e1f..00000000 --- a/tutorials/fannie_mae_10pct/02a_create_forward_looking_flag_4credit_risk_forum.r +++ /dev/null @@ -1,47 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -outpath -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -nrow(fmdf) - - -# no need to hard group by it's already sharded by loan_id -pt <- proc.time() -defaults <- map.disk.frame(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 
months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -head(defaults) - -defaults %>% - filter(loan_id == "107150255179") %>% - collect diff --git a/tutorials/fannie_mae_10pct/02b_foverlaps.r b/tutorials/fannie_mae_10pct/02b_foverlaps.r deleted file mode 100644 index d89094de..00000000 --- a/tutorials/fannie_mae_10pct/02b_foverlaps.r +++ /dev/null @@ -1,29 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae_10pct/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -head(tmp3) diff --git a/tutorials/fannie_mae_10pct/02c_plot_odr.r b/tutorials/fannie_mae_10pct/02c_plot_odr.r deleted file mode 100644 index fc8308ab..00000000 --- a/tutorials/fannie_mae_10pct/02c_plot_odr.r +++ /dev/null @@ -1,34 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -nrow(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect(parallel = T) %>% - 
group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) - - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae_10pct/10_a_one_var.R b/tutorials/fannie_mae_10pct/10_a_one_var.R deleted file mode 100644 index e5b3cf03..00000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var.R +++ /dev/null @@ -1,55 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - - -# xgboost fit ------------------------------------------------------------- - - -################################################### -# the xgboost fit -################################################### -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -# plotting ---------------------------------------------------------------- - - -################################################### -# plotting -################################################### -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 
= max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() diff --git a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R b/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R deleted file mode 100644 index 266fb107..00000000 --- a/tutorials/fannie_mae_10pct/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- disk.frame:::add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- disk.frame:::add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), 
~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_b_rounded.R b/tutorials/fannie_mae_10pct/10_b_rounded.R deleted file mode 100644 index 73c1893a..00000000 --- a/tutorials/fannie_mae_10pct/10_b_rounded.R +++ /dev/null @@ -1,123 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = 
prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# 
-# } \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/10_b_two_vars.r b/tutorials/fannie_mae_10pct/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae_10pct/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_c_two_vars.R b/tutorials/fannie_mae_10pct/10_c_two_vars.R deleted file mode 100644 index 3124d577..00000000 --- a/tutorials/fannie_mae_10pct/10_c_two_vars.R +++ /dev/null @@ -1,64 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() 
-firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) -secondm - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot( - map_dbl( - list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9), main="AUC vs number of variables",ylab="AUC",xlab="Number of Variables") -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 66eb27b3..00000000 --- a/tutorials/fannie_mae_10pct/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,91 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - 
while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - 
c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_10pct/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - 
score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 6df43da8..00000000 --- a/tutorials/fannie_mae_10pct/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, 
monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - 
map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R b/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_10pct/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = 
var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_10pct/11_speedglm.R b/tutorials/fannie_mae_10pct/11_speedglm.R deleted file mode 100644 index 33b1323d..00000000 --- a/tutorials/fannie_mae_10pct/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming 
function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_10pct/12_keras.r b/tutorials/fannie_mae_10pct/12_keras.r deleted file mode 100644 index e43f2012..00000000 --- a/tutorials/fannie_mae_10pct/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae_10pct/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, 
data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - 
))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/00_setup.r b/tutorials/fannie_mae_new/00_setup.r deleted file mode 100644 index 5bb675ec..00000000 --- a/tutorials/fannie_mae_new/00_setup.r +++ /dev/null @@ -1,66 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(future.apply) -library(data.table) -library(disk.frame) - -setup_disk.frame() - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -#raw_perf_data_path = "c:/data/Performance_All/" - -## read the split data set is ALOT (6x) faster -raw_perf_data_path = "c:/data/Performance_All_split/" -#raw_perf_data_path = "d:/data/Performance_All/" - - -raw_harp_data_path = "c:/data/fannie_mae/harp_files" -#raw_harp_data_path = "D:/data/fannie_mae/harp_files" - -# level of compression from 1 to 100 where 100 is the highest level -# compress = 50 - -# where the outputs go -#outpath = "c:/data/fannie_mae_disk_frame/" -outpath = "c:/data/fannie_mae_disk_frame/" - -#acqzip_file_path = "c:/data/Acquisition_All.zip" -acqzip_file_path = "c:/data/fannie_mae/Acquisition_All.zip" - -#appl_path = "C:/data/" -appl_path = "c:/data" - 
-Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - diff --git a/tutorials/fannie_mae_new/01a_read_from_csv.r b/tutorials/fannie_mae_new/01a_read_from_csv.r deleted file mode 100644 index e97b7c65..00000000 --- a/tutorials/fannie_mae_new/01a_read_from_csv.r +++ /dev/null @@ -1,40 +0,0 @@ -source("tutorials/fannie_mae/00_setup.r") - -# number of rows to read in from each file in one go -nreadin 
= NULL - -# your ram size in G -# this is needed as memomry.limit seems broken -ram.size = 64 -nc = parallel::detectCores(logical = FALSE) -conservatism = 2 -minchunks = nc -# compression ratio, max = 100 for best compression but slower running speed -# compress = 100 - -# set up some variable for future use - -full_file_path = dir(raw_perf_data_path, full.names = T) - - -# randomise it to maximize the chance of a good load balancing -#set.seed(1) -#full_file_path = sample(full_file_path, length(full_file_path)) - -# convert CSV in parallel -pt <- proc.time() -tot_file_size = sum(file.size(full_file_path))/1024^3 -res = csv_to_disk.frame( - full_file_path[1:6], - outdir = file.path(outpath, "raw_fannie_mae"), - #shardby = "loan_id", - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep = "|", - in_chunk_size = nreadin, - nchunks = max(round(tot_file_size/ram.size*nc)*nc*conservatism, nc), - overwrite = TRUE, - .progress = TRUE # show progress -) -print(data.table::timetaken(pt)) - diff --git a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r b/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r deleted file mode 100644 index 1ee3669c..00000000 --- a/tutorials/fannie_mae_new/01d_OPTIONAL_rechunk.r +++ /dev/null @@ -1,5 +0,0 @@ -library(disk.frame) - -fmdf <- disk.frame("fmdf") - -rechunk(fmdf, 128) diff --git a/tutorials/fannie_mae_new/01e_read_data_for_harp.r b/tutorials/fannie_mae_new/01e_read_data_for_harp.r deleted file mode 100644 index b9f1173c..00000000 --- a/tutorials/fannie_mae_new/01e_read_data_for_harp.r +++ /dev/null @@ -1,85 +0,0 @@ -#1c2 -source("tutorials/fannie_mae/00_setup.r") - -pt <- proc.time() - -# rows to read in one go -rows_to_read = 1e7 - -# load the Fannie Mae disk.frame -fmdf <- disk.frame(file.path(outpath, "fm.df")) - -#system.time(harp <- fread("C:/data/HARP_Files/Performance_HARP.txt", colClasses = Performance_ColClasses, col.names = Performance_Variables)) -harp_mapping <- 
fread(file.path(raw_harp_data_path,"Loan_Mapping.txt"), colClasses = "c", col.names = c("loan_id", "harp_id")) -setkey(harp_mapping, harp_id) -fst::write_fst(harp_mapping,"harp_mapping.fst.tmp") -fs::file_move("harp_mapping.fst.tmp", "harp_mapping.fst") -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - -# took about 438.65 on laptop 500 chunks -# took about 205 on desktop 56 chunks -pt <- proc.time() -system.time( - harp <- csv_to_disk.frame(file.path(raw_harp_data_path,"Performance_HARP.txt"), inmapfn = function(df) { - setnames(df, "loan_id", "harp_id") - - merge(df, harp_mapping, by="harp_id") - }, - nchunks = nchunks(fmdf), - in_chunk_size = rows_to_read, - shardby = "loan_id", - outdir = file.path(outpath, "harp.df"), - colClasses = Performance_ColClasses, - col.names = Performance_Variables, - sep="|")) - -print("reading in and saving HARP mapping file took: ") -print(data.table::timetaken(pt)) - - -if(F) { - # it can be seen that some accounts can start n harp the month the same that it ends in the dataset - harp1 = get_chunk.disk.frame(harp,1) - fmdf1 = get_chunk.disk.frame(fmdf,1) - - harp1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - fmdf1[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - - fmdf2 = fmdf1[loan_id %in% unique(harp1$loan_id)] - - fmdf3 = merge( - fmdf2[,.(max_date = max(date)),loan_id], - harp1[,.(min_date = min(date)),loan_id], - by = "loan_id" - ) - - fmdf3[max_date == min_date,] - - fmdf3[max_date > min_date,] - fmdf3[max_date <= min_date,] - - harp4 = harp[,.N, delq.status, keep="delq.status"] -} - - -# print(nrow(harp)) -# system.time(harp <- merge(harp, harp_mapping, by="harp_id")) -# print(nrow(harp)) -# setkey(harp,"loan_id") -# system.time(fst::write_fst(harp,"harp.fst")) # 16.63 -#system.time(fst::write_fst(harp,"harp100.fst",100)) # 391 seconds - -# check if the harp_id -> loan_id is successful -if(F) { - harp_acq <- fread("C:/data/HARP_Files/Acquisition_HARP.txt") - 
fst::write_fst(harp_mapping,"harp_mapping.fst") - - harp[,date:=as.Date(month,"%m/%d/%Y")] - harp[,.N,date][order(date)] - fmdf = disk.frame("fmdf") - system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - harp_uid = unique(harp_mapped$loan_id) - def = intersect(uid, harp_uid) - def -} \ No newline at end of file diff --git a/tutorials/fannie_mae_new/02_exploratory.r b/tutorials/fannie_mae_new/02_exploratory.r deleted file mode 100644 index 2a362d81..00000000 --- a/tutorials/fannie_mae_new/02_exploratory.r +++ /dev/null @@ -1,19 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame("fmdf0") - -system.time(a <- fmdf[,.N,delq.status,keep="delq.status"][,.(N=sum(N)),delq.status]) -setkey(a,delq.status); a - -str(a) - -str(head(fmdf)) - -b = fmdf[,.N,monthly.rpt.prd,keep="monthly.rpt.prd"][,sum(N),monthly.rpt.prd]; b - - -names(fmdf) -servicer.name_cnt = fmdf[,.N,servicer.name,keep=c("servicer.name")] - -servicer.name_cnt[,sum(N),servicer.name] diff --git a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r b/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r deleted file mode 100644 index 2d7dc6bc..00000000 --- a/tutorials/fannie_mae_new/02a_create_forward_looking_flag.r +++ /dev/null @@ -1,97 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -harp = disk.frame(file.path(outpath, "harp.df")) - -# harp contains the first date on which a loan enters into HARP -pt <- proc.time() -harp1 <- harp %>% - srckeep(c("loan_id","monthly.rpt.prd")) %>% - group_by(loan_id, hard = F) %>% # the data is sharded by loan_id hence hard = F is fine - summarise(first_harp_date = min(as.Date(monthly.rpt.prd, "%m/%d/%Y"))) %>% - collect(parallel = T) # performs the collection in parallel - -cat(glue("Creating first harp date took: {timetaken(pt)}")) - -if(F) { - # data.table syntax - harp[,.(first_harp_date = min(as.Date(monthly.rpt.prd, 
"%m/%d/%Y"))), loan_id, - keep=c("loan_id", "monthly.rpt.prd")] -} - -harp1[,before_12m_first_harp_date:=first_harp_date] -# 164 seconds -system.time(lubridate::month(harp1$before_12m_first_harp_date) <- lubridate::month(harp1$before_12m_first_harp_date) - 12) -harp1[,harp_12m := TRUE] - -# break it out into smaller chunks for even faster merging -# took 27 -pt <- proc.time() -harp2 <- shard( - harp1, - "loan_id", - nchunks = nchunks(fmdf), - outdir = file.path(outpath, "first_harp_date.df"), - overwrite=T) -cat(glue::glue("sharding HARP defaults took {timetaken(pt)}")) - -# no need to hard group by it's already sharded by loan_id -# took about 503 seconds -# took about 11:52 -pt <- proc.time() -defaults <- cmap(fmdf, function(df) { - # create the default flag - df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - setkey(df,loan_id, date) - df[,delq.status := as.integer(delq.status)] - df[,default:=delq.status>=3] - df[is.na(default),default:=F] - - df2 = df[default==T,.(loan_id, start_date =date, end_date = date)] - lubridate::month(df2$start_date) <- lubridate::month(df2$start_date)-12 - - # get rid of overlaps - df2[order(start_date),lag_end_date := shift(end_date,1), loan_id] - - df2[,grp_incr := 1] - df2[start_date <= lag_end_date, grp_incr := 0, loan_id] - - df2[,grp:=cumsum(grp_incr)] - df3 = df2[,.(start_date = min(start_date), end_date = max(end_date), default_12m=T),.(loan_id,grp)] - - df3[,grp:=NULL] - - # create the hardship flag - # whether the customer goes into in the next 12 months hardship - df3 - }, - keep = c("monthly.rpt.prd", "delq.status", "loan_id"), - outdir = file.path(outpath, "defaults.df"), - lazy = F, - overwrite = T) -cat(glue::glue("Creating forward looking default flag took {timetaken(pt)}")) - -# -# a = df[loan_id == "100513171914", ] -# -# a[,monthly.rpt.prd := as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# a[,monthly.rpt.prd2:=monthly.rpt.prd] -# -# -# setkey(df3,"loan_id","start_date","end_date") -# -# a1= foverlaps(a, df3, by.x = 
c("loan_id","monthly.rpt.prd","monthly.rpt.prd2"), -# by.y=c("loan_id","start_date","end_date")) -# -# a1[,.(loan_id,monthly.rpt.prd,start_date, end_date)] - -# create a list that show defaults ---------------------------------------- -# system.time(defaults <- chunk_lapply(fmdf, function(df) { -# df[,date:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] -# df[,delq.status := as.integer(delq.status)] -# df[,default:=delq.status>=3] -# df[is.na(default),default:=F] -# df1 = df[default == F] -# df1 -# }, outdir="tmp1")) diff --git a/tutorials/fannie_mae_new/02b_foverlaps.r b/tutorials/fannie_mae_new/02b_foverlaps.r deleted file mode 100644 index 215c7890..00000000 --- a/tutorials/fannie_mae_new/02b_foverlaps.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("tutorials/fannie_mae/00_setup.r") - -fmdf = disk.frame(file.path(outpath, "fm.df")) -tmp2 = disk.frame(file.path(outpath, "defaults.df")) -harp2 = disk.frame(file.path("first_harp_date.df")) - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = delayed(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -# took 513 seconds -pt <- proc.time() -tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir=file.path(outpath, "fm_with_default"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - overwrite = T -) -cat(glue::glue("time taken to merge on default flag {timetaken(pt)}")) - -# took 6:55 -pt <- proc.time() -tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir=file.path(outpath, "fm_with_harp"), - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - overwrite 
= T -) -cat(glue::glue("time taken to merge on HARP flag {timetaken(pt)}")) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r b/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r deleted file mode 100644 index 825d28d4..00000000 --- a/tutorials/fannie_mae_new/02b_foverlaps_Reduced16.r +++ /dev/null @@ -1,68 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") -pt <- proc.time() - -fmdf = disk.frame("only16.df") -tmp2 = disk.frame("defaults.df") # tmp2 contains all the defaults -harp2 = disk.frame("first_harp_date.df") - -fmdf_lazy = delayed(fmdf, function(df) { - df[,monthly.rpt.prd:=as.Date(monthly.rpt.prd,"%m/%d/%Y")] - df[,monthly.rpt.prd2:=monthly.rpt.prd] -}) - -tmp2_lazy = delayed(tmp2, function(df) { - setkey(df, "loan_id","start_date","end_date") -}) - -harp2_lazy = lazy(harp2, function(df) { - setkey(df, "loan_id","before_12m_first_harp_date","first_harp_date") -}) - -#dh <- rbindlist.disk.frame(tmp2_lazy, harp2_lazy, outdir ="defaults_harp.df", by_chunk_id = T) -#plan(transparent) -system.time(tmp3 <- foverlaps.disk.frame( - fmdf_lazy, - tmp2_lazy, - outdir="tmp3_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "start_date", "end_date"), - merge_by_chunk_id = T, - compress = 50 -)) - - -system.time(tmp4 <- foverlaps.disk.frame( - tmp3, - harp2_lazy, - outdir="tmp4_16", - by.x = c("loan_id", "monthly.rpt.prd", "monthly.rpt.prd2"), - by.y = c("loan_id", "before_12m_first_harp_date", "first_harp_date"), - merge_by_chunk_id = T, - compress = 50 -)) - -data.table::timetaken(pt) - - -tmp4 <- disk.frame("tmp4_16") -system.time(tmp5 <- tmp4[,.(ndef=sum(default_12m, na.rm=T),.N), .(monthly.rpt.prd, delq.status), - keep=c("default_12m","monthly.rpt.prd","delq.status")]) - - -tmp5[, delq.status.cap := pmin(5, as.integer(delq.status))] -tmp5[delq.status == "X", delq.status.cap := 0] -tmp5[delq.status == "", delq.status.cap := 0] - - -tmp6 <- tmp5[,.(ndef=sum(ndef), 
N=sum(N)),.(monthly.rpt.prd, delq.status.cap)] - - -tmp6[,defr:=ndef/N] - -library(ggplot2) -tmp6 %>% - ggplot + - geom_line(aes(x=monthly.rpt.prd, y = defr, colour=as.factor(delq.status.cap))) - - diff --git a/tutorials/fannie_mae_new/02c_plot_odr.r b/tutorials/fannie_mae_new/02c_plot_odr.r deleted file mode 100644 index 4521511a..00000000 --- a/tutorials/fannie_mae_new/02c_plot_odr.r +++ /dev/null @@ -1,45 +0,0 @@ -# 2_exploratory.r -source("inst/fannie_mae/0_setup.r") - -fm_with_harp = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fm_with_harp) - -# need a two stage summary -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","harp_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T), - n_goto_harp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N), - oh = sum(n_goto_harp)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr, - `HARP Conversion Rate%` = oh - )) - - -if(F) { - # data.table syntax - system.time(a_wh <- tmp4[ - ,.(.N, n_defaults = sum(default_12m, na.rm = T), ) - ,.(monthly.rpt.prd), keep=c("default_12m","harp_12m","monthly.rpt.prd")]) - - a_wh1 = a_wh[,.(odr = sum(n_defaults)/sum(N), oh = sum(n_goto_harp)/sum(N)), monthly.rpt.prd] - setnames(a_wh1, c("monthly.rpt.prd","odr", "oh"), c("Date", "Observed Default Rate%", "HARP Conversion Rate%")) -} - -a_wh2 = a_wh1 %>% gather(key = type, value=rate, -Date) - -ggplot(a_wh2) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") - - diff --git a/tutorials/fannie_mae_new/02d_harp_check.r b/tutorials/fannie_mae_new/02d_harp_check.r deleted file mode 100644 index f9fd020a..00000000 --- a/tutorials/fannie_mae_new/02d_harp_check.r +++ /dev/null @@ -1,33 +0,0 @@ -#2d_harp_check.r -source("inst/fannie_mae/0_setup.r") 
- -system.time(harp <- fread("D:/data/fannie_mae/HARP_Files/Performance_HARP.txt", select = c("V1","V2"), colClasses = "c")) -setnames(harp,names(harp), c("harp_id","month")) - -harp_mapping <- fread("D:/data/fannie_mae/HARP_Files/Loan_Mapping.txt", colClasses = "c", col.names = c("loan_id", "harp_id")) -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp_mapped = merge(harp, harp_mapping, by="harp_id") -c(nrow(harp), nrow(harp_mapped)) - -harp_acq <- fread("d:/data/fannie_mae/HARP_Files/Acquisition_HARP.txt") -fst::write_fst(harp_mapping,"harp_mapping.fst") - -harp[,date:=as.Date(month,"%m/%d/%Y")] -harp[,.N,date][order(date)] - -fmdf = disk.frame("fmdf") - -system.time(uid <- fmdf[,.(loan_id = unique(loan_id)), keep = "loan_id"][,unique(loan_id)]) - -harp_uid = unique(harp_mapped$loan_id) - -def = intersect(uid, harp_uid) -def - -system.time(harp.df <- shard(harp,"loan_id", nchunks = nchunks(fmdf), outdir = "harp.df", overwrite = T)) - -system.time(fmdfh <- rbindlist.disk.frame(list(fmdf, harp.df), outdir = "fmdf1_w_harp")) - -# nothing in the intersection -# so either HARP ids were changed once there are in the harp program \ No newline at end of file diff --git a/tutorials/fannie_mae_new/10_a_one_var.R b/tutorials/fannie_mae_new/10_a_one_var.R deleted file mode 100644 index fae7540c..00000000 --- a/tutorials/fannie_mae_new/10_a_one_var.R +++ /dev/null @@ -1,191 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = 
"binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# 
transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - -prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -# auc <- function(target, score) { -# df = data.table(target, score) -# -# df1 = df[,.(nt = sum(target), n = .N, score)] -# setkey(df1, score) -# } -# -# add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { -# -# xy = df %>% -# srckeep(c(target, feature)) %>% -# collect(parallel = T) -# -# # evaluate -# code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") -# eval(parse(text = code)) -# -# dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) -# -# if(is.null(prev_pred)) { -# pt = proc.time() -# m2 <- 
xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# } else { -# setinfo(dtrain, "base_margin", prev_pred) -# pt = proc.time() -# m2 <- xgboost( -# data=dtrain, -# nrounds = 1, -# objective = "binary:logitraw", -# tree_method="exact", -# monotone_constraints = monotone_constraints -# ) -# timetaken(pt) -# -# a2 = predict(m2, dtrain) -# a3 = predict(m2, dtrain, predcontrib = T) -# } -# -# map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% -# keep(~!is.na(.x)) %>% -# map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% -# sort -> bins -# -# code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") -# eval(parse(text = code)) -# -# new_bins = sort(unique(bb$m2)) -# code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") -# eval(parse(text = code1)) -# -# setkey(bb, bins) -# bb %>% -# filter(!is.na(bins)) %>% -# mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% -# ggplot + -# geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + -# coord_flip() -# -# -# } - diff --git a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R b/tutorials/fannie_mae_new/10_a_one_var_with_fn.R deleted file mode 100644 index 687abcc7..00000000 --- a/tutorials/fannie_mae_new/10_a_one_var_with_fn.R +++ /dev/null @@ -1,56 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm 
<- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/10_b_two_vars.r b/tutorials/fannie_mae_new/10_b_two_vars.r deleted file mode 100644 index d0df6cd4..00000000 --- a/tutorials/fannie_mae_new/10_b_two_vars.r +++ /dev/null @@ -1,61 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -pt <- proc.time() -firstm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "oltv", - monotone_constraints = 1, - format_fn = function(v) { - ceiling(v / 5) * 5 - }) -timetaken(pt) - -firstm - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - 
prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -#plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -#abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r b/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index 6f7a471e..00000000 --- a/tutorials/fannie_mae_new/10a_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - #browser() - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - #browser() - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", 
"orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R b/tutorials/fannie_mae_new/10b_make_scorecard_visible.R deleted file mode 100644 index 36fb832d..00000000 --- a/tutorials/fannie_mae_new/10b_make_scorecard_visible.R +++ /dev/null @@ -1,109 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - 
if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, 
"appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -View(scorecard) - -DT::datatable(scorecard) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/11_speedglm.R b/tutorials/fannie_mae_new/11_speedglm.R deleted file mode 100644 index a7355ac9..00000000 --- a/tutorials/fannie_mae_new/11_speedglm.R +++ /dev/null @@ -1,50 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/fannie_mae_new/12_keras.r b/tutorials/fannie_mae_new/12_keras.r 
deleted file mode 100644 index ba164e2c..00000000 --- a/tutorials/fannie_mae_new/12_keras.r +++ /dev/null @@ -1,178 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - #browser() - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - #browser() - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - 
#a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/fannie_mae_new/13_disk.r b/tutorials/fannie_mae_new/13_disk.r deleted file mode 100644 index 10adc926..00000000 --- 
a/tutorials/fannie_mae_new/13_disk.r +++ /dev/null @@ -1,25 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -fmdf = disk.frame(file.path(outpath, "fm_with_harp")) - -head(fmdf) - -nrow(fmdf) - -pt = proc.time() -default_rate_over_time <- fmdf %>% - srckeep(c("default_12m", "monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(default_rate = ndef/n) -timetaken(pt) - -default_rate_over_time %>% - ggplot + - geom_line(aes(x = monthly.rpt.prd, y = default_rate)) + - ggtitle("Fannie Mae Default Rate%") diff --git a/tutorials/fannie_mae_new/3a_get_some_data.r b/tutorials/fannie_mae_new/3a_get_some_data.r deleted file mode 100644 index b86128a8..00000000 --- a/tutorials/fannie_mae_new/3a_get_some_data.r +++ /dev/null @@ -1,113 +0,0 @@ -library(disk.frame) - -source("inst/fannie_mae/0_setup.r") - -a = disk.frame(file.path(outpath, "fm_with_harp")) - -pt = proc.time() -a1 <- a %>% - srckeep(c("loan_id","monthly.rpt.prd","delq.status", "default_12m")) %>% - delayed(~{ - setkey(.x, loan_id) - uid = sample_frac(.x[,.(loan_id = unique(loan_id))], 0.01) - .x[uid,] -}) %>% collect(parallel = T) -timetaken(pt) - -a1[delq.status != "X",delq.statusn := as.numeric(delq.status)] -a1[delq.status %in% c("","X"),delq.statusn := 0] - -#a1[, delq.statusn := pmin(delq.statusn, 3)] - -a1[,.N,delq.statusn] - -a1[is.na(delq.statusn)] - -# remove those that have already defulat -a2 = a1 - -a2[is.na(default_12m), default_12m := FALSE] - -a3 = a2[,.(loan_id, monthly.rpt.prd, delq.statusn, default_12m)] -setkey(a3, loan_id, monthly.rpt.prd) - -a3[,sum(default_12m)/.N,delq.statusn] - -# create worst delq status last 12 months -system.time(a4 <- a3[, shift(delq.statusn,n=1:12), by=loan_id]) - -eval(parse(text=glue::glue("a4[,worst_delq_last_12m := pmax({paste0(paste0('V', 1:12), collapse=',')}, 
na.rm=T)]"))) - -a5 = bind_cols(a3, a4[,.(worst_delq_last_12m)]) - -a6 = a5#[delq.statusn < 3,] - -a6[,worst_delq_last_12m_capped := pmin(worst_delq_last_12m, 6)] - -a6[,.N, worst_delq_last_12m_capped] - -a6devid = sample_frac(a6[,.(loan_id = unique(loan_id))], 0.7) - -a6dev = a6[a6devid] -a6val = a6[!a6devid] - -a6devpd = a6dev[,.(pd = sum(default_12m)/.N), worst_delq_last_12m_capped] - -a6val = left_join(a6val, a6devpd, by = "worst_delq_last_12m_capped") -a6dev = left_join(a6dev, a6devpd, by = "worst_delq_last_12m_capped") - -setkey(a6val, loan_id, monthly.rpt.prd) -setkey(a6dev, loan_id, monthly.rpt.prd) - -for_write <- function(a6val) { - a6val1 = a6val[as.Date("2015-06-01") < monthly.rpt.prd & monthly.rpt.prd <= as.Date("2016-06-01") & !is.na(worst_delq_last_12m_capped), ] - a6val1[,N:=.N,loan_id] - - a6val2 = a6val1[N == 12,] - setkey(a6val2, loan_id, monthly.rpt.prd) - - toremove = a6val2[,.SD[12,delq.statusn], loan_id] - - toremove1 = toremove[V1 >= 3, loan_id] - - a6val2 = a6val2[!(loan_id %in% toremove1), ] - - a6val2[order(loan_id), id := rleid(loan_id)] - setkey(a6val2, loan_id, monthly.rpt.prd) - a6val2 -} - - -a6val2 <- for_write(a6val) -a6dev2 <- for_write(a6dev) - - -a6dev2[,.N, delq.statusn] -a6val2[,.N, delq.statusn] - -fwrite(a6val2[,.(id, delq_status = delq.statusn, default_12m)], "val.csv") -fwrite(a6dev2[,.(id, delq_status = delq.statusn, default_12m)], "dev.csv") - - - -gini <- function(a6val) { - setkey(a6val, id, monthly.rpt.prd) - a6val = a6val[seq(12,.N, by=12),] - a6_auc = a6val[order(worst_delq_last_12m_capped), .(bads = sum(default_12m), tots =.N), pd] - - a6_auc[,score:=-pd] - setkey(a6_auc, score) - - a6_auc[order(score), height := bads/sum(bads)] - a6_auc[order(score), width := tots/sum(tots)] - plot(a6_auc[,.(cumsum(width), cumsum(height))], type="l") - abline(0,1) - a6_auc2 = bind_rows(data.table(height=0, width=0), a6_auc) - a6_auc2[,cheight := cumsum(height)] - a6_auc2[,cwidth := cumsum(width)] - 
2*a6_auc2[,sum((cheight+lag(cheight))*c(0,diff(cwidth))/2, na.rm=T)]-1 -} - -gini(a6val2) -gini(a6dev2) - diff --git a/tutorials/fannie_mae_new/4a_read_appl.r b/tutorials/fannie_mae_new/4a_read_appl.r deleted file mode 100644 index 29483b2f..00000000 --- a/tutorials/fannie_mae_new/4a_read_appl.r +++ /dev/null @@ -1,59 +0,0 @@ -source("tutorials/fannie_mae/0_setup.r") -library(disk.frame) - -fs::dir_delete(file.path(outpath, "acq.df")) - - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") - - - -pt = proc.time() -acq <- zip_to_disk.frame( - acqzip_file_path, - outdir = file.path(outpath, "acq.df"), - col.names = Acquisitions_Variables, - colClasses = Acquisition_ColClasses, - shardby = "loan_id", - nchunks = nchunks(fmdf)) -timetaken(pt) - -pt = proc.time() -acqall <- rbindlist.disk.frame(acq, outdir = file.path(outpath, "acq_all.df")) -timetaken(pt) - -# took 50 minutes - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -pt = proc.time() -fmdf_all = left_join( - fmdf, - acqall, - by = "loan_id", - merge_by_chunk_id = T, - outdir = file.path(outpath, "fmdf_appl") - ) -timetaken(pt) - -acqall = disk.frame(file.path(outpath, "acq_all.df")) - -fmdf <- disk.frame(file.path(outpath,"fm_with_harp")) - - -for(i in 1:nchunks(fmdf)) { - print(i) - a1 = get_chunk(acqall,i, keep="loan_id") %>% unique - a2 = get_chunk(fmdf,i, keep="loan_id") %>% unique - 
stopifnot(nrow(a1) == nrow(a2)) - stopifnot(nrow(setdiff(a1,a2)) == 0) - stopifnot(nrow(setdiff(a2,a1)) == 0) -} - diff --git a/tutorials/fannie_mae_new/4b_exploratory.R b/tutorials/fannie_mae_new/4b_exploratory.R deleted file mode 100644 index 1cf4382d..00000000 --- a/tutorials/fannie_mae_new/4b_exploratory.R +++ /dev/null @@ -1,92 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -#future::plan(multiprocess(workers = availableCores())) - -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) - -head(fmdf_all) - -# how many delq.status are there ------------------------------------------ -# system.time(dscnt <- fmdf_all[,.N,delq.status, keep="delq.status"]) -#dscnt[order(delq.status),sum(N),delq.status %>% as.integer] - -system.time(dscnt2 <- fmdf_all %>% - srckeep(c("delq.status", "default_12m", "harp_12m")) %>% - group_by(delq.status, hard = F) %>% - summarise(n = n(), ndef = sum(default_12m, na.rm=T), nharp = sum(harp_12m, na.rm=T)) %>% - collect(parallel = T) %>% - group_by(delq.status) %>% - summarise(n = sum(n), ndef = sum(ndef), nharp = sum(nharp)) %>% - arrange(delq.status)) - - -dscnt2[delq.status %in% c("X","","0","1","2"),.(delq.status, ndef/n)] - -# how many default and go to HARP ----------------------------------------- -# system.time(hd <- fmdf_all[,.N, .(harp_12m,default_12m), keep=c("harp_12m","default_12m")]) -# hd[,.(N = sum(N)), .(harp_12m, default_12m)] - -# 45 seconds -system.time(hd <- fmdf_all %>% - srckeep(c("harp_12m", "default_12m", "delq.status")) %>% - filter(delq.status %in% c("X","","0","1","2")) %>% - group_by(harp_12m, default_12m, hard = F) %>% - summarise(n = n()) %>% - collect(parallel = T) %>% - group_by(harp_12m, default_12m) %>% - summarise(n = sum(n))) - -# Source: local data table [4 x 3] -# Groups: harp_12m -# -# # A tibble: 4 x 3 -# harp_12m default_12m n -# -# 1 NA NA 1765366501 -# 2 TRUE NA 12343954 -# 3 NA TRUE 22366382 -# 4 TRUE TRUE 638 - -# how many accounts eventually default 
------------------------------------ -#future::plan(multiprocess(workers = availableCores())) -pt = proc.time() # 4:26 -simple2 = fmdf_all %>% - srckeep(c("oltv", "default_12m", "harp_12m", "loan_id", "orig_dte", "delq.status")) %>% - #filter(delq.status %in% c("0","X","1","2")) %>% - mutate(orig_yr = substr(orig_dte, 4,7)) %>% - group_by(oltv, loan_id, orig_yr, hard = F) %>% - summarise(ndef = sum(default_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(oltv, loan_id, orig_yr) %>% - summarise(ndef = sum(ndef), n = sum(n)) -timetaken(pt) - -stopifnot(simple2[,n_distinct(loan_id) == .N]) - -#simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr_band = cut(as.numeric(orig_yr),c(-Inf, 2005, 2007, 2008,2009,Inf)), oltv_band = ceiling(oltv/10)*10)] - -simple3 = simple2[,.(ndef = sum(ndef > 0), .N), .(orig_yr, oltv_band = cut(oltv, c(-Inf, seq(0,80,by=20),Inf)))] - -simple3 %>% - mutate(odr = ndef/N) %>% - select(oltv_band, odr, orig_yr) %>% - spread(key = oltv_band, value = odr) - -simple3 %>% - filter(!is.na(oltv_band)) %>% - mutate(odr = ndef/N) %>% - arrange(oltv_band) %>% - ggplot + - geom_bar( - aes( - x = as.factor(oltv_band), - weight = odr, - colour = orig_yr), - position = "dodge") - - - - - - - diff --git a/tutorials/fannie_mae_new/4c_explore_xgboost.r b/tutorials/fannie_mae_new/4c_explore_xgboost.r deleted file mode 100644 index f6a53cba..00000000 --- a/tutorials/fannie_mae_new/4c_explore_xgboost.r +++ /dev/null @@ -1,44 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -vars_to_keep = c( - Performance_Variables[1:15] %>% tolower, - Acquisitions_Variables[-1] %>% tolower, - c("default_12m", "harp_12m") -) %>% unique - -# create a new variable dh12m which is a concatenation of default in next 12 months or going into harp next 12 months -fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) %>% - srckeep(vars_to_keep) %>% - delayed(~{ - .x[,dh12m := default_12m | harp_12m] - .x[is.na(dh12m), dh12m:=F] - .x - }) - -fmdf1 
= disk.frame(file.path(outpath, "fmdf_appl")) - -# took 2 mins -system.time(uid <- unique(fmdf1[substr(orig_dte ,4,7) >= "2014",unique(loan_id), keep=c("loan_id","orig_dte")])) - -# set.seed(1) -# suid = sample(uid, length(uid)/100) - -uiddf = data.table(loan_id = uid) -setkey(uiddf, loan_id) - -#fmdf_all = disk.frame(file.path(outpath, "fmdf_appl")) -# took about 10 minutes -system.time(fmdf_2yr <- fmdf_all %>% - map.disk.frame(~{ - setkey(.x, loan_id) - merge(.x, uiddf, by="loan_id") - }, - lazy = F, - outdir = file.path(outpath,"fmdf_2yr"), - overwrite = T)) - -# fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) -system.time(sfmdf_2yr <- sample_frac(fmdf_2yr, 1) %>% collect(parallel = F)) - - diff --git a/tutorials/fannie_mae_new/4d_xgboost.r b/tutorials/fannie_mae_new/4d_xgboost.r deleted file mode 100644 index 248e676a..00000000 --- a/tutorials/fannie_mae_new/4d_xgboost.r +++ /dev/null @@ -1,151 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -fmdf_2yr = disk.frame(file.path(outpath,"fmdf_2yr")) - -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - - -# show how to binning ----------------------------------------------------- -system.time(xy <- fmdf_2yr[,c("dh12m", "oltv"), keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy[,"oltv"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> xy[,"oltv"] - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -pt = proc.time() -m2 <- xgboost( - data=x, - label = y, - nrounds = 3, - objective = "binary:logitraw", - tree_method="exact", - 
monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -print(length(bins)) - -xgb.plot.tree(model = m) - - -# with penalty ------------------------------------------------------------ -system.time(xy <- fmdf_2yr[,.(dh12m, oltv),keep=c("dh12m","oltv")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$oltv)) - -system.time(xycv <- - xgb.cv(dtrain, objective="binary:logitraw", nfold = 5, nrounds=1)) -xycv - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411 -) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = cut(oltv,c(-Inf,bins,Inf)))] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - -xgb.plot.tree(model = m) - -# do a variable with NA --------------------------------------------------- -system.time(xy <- fmdf_2yr[,.(dh12m, mi_pct),keep=c("dh12m","mi_pct")]) - -dtrain <- xgb.DMatrix(label = xy$dh12m, data = as.matrix(xy$mi_pct)) - -pt = proc.time() -m <- xgboost( - data= dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = 32662/10067411) -timetaken(pt) - - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.[\\d]+]") %>% as.numeric) %>% - sort %>% - floor %>% - unique -> bins - -bb = xy[,sum(dh12m)/.N, .(bins = 
cut(mi_pct,c(-Inf,bins,Inf)) %>% addNA)] - -setkey(bb, bins) - -barplot(height = bb$V1, names.arg = bb$bins) - - -# do other stuff ---------------------------------------------------------- - - -# took 142 -a = select_if(as.tibble(sfmdf_2yr), is.numeric) %>% - select(-c(mi_type)) - -system.time(x <- as.matrix(a)) - - -dtrain <- xgb.DMatrix(label = sfmdf_2yr$dh12m, data = x) - - -xgboost( - data=x, - label = sfmdf_2yr$dh12m, - nrounds = 2, - objective = "binary:logitraw", - tree_method="exact") - - diff --git a/tutorials/fannie_mae_new/5a_appl_model.r b/tutorials/fannie_mae_new/5a_appl_model.r deleted file mode 100644 index fb088672..00000000 --- a/tutorials/fannie_mae_new/5a_appl_model.r +++ /dev/null @@ -1,53 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -defaults = disk.frame(file.path(outpath,"defaults.df")) %>% - srckeep(c("loan_id", "start_date")) %>% - collect - -defaults[,first_default_date := start_date] -month(defaults$first_default_date) <- month(defaults$first_default_date)+12 - -defaults = defaults[,.(loan_id, first_default_date)] - -acqall = disk.frame(file.path(outpath,"acq_all.df")) - -# 1:14 -pt <- proc.time() -acqall1 <- left_join( - acqall, - defaults, - by = "loan_id") %>% - delayed(~{ - library(lubridate) - .x[ - !is.na(first_default_date), - mths_to_1st_default := interval( - as.Date(paste0("01/", frst_dte),"%d/%m/%Y"), - first_default_date) - %/% months(1)] - - .x[,default_next_12m := F] - .x[!is.na(first_default_date), default_next_12m := mths_to_1st_default <= 12] - - res = .x[substr(frst_dte,4,7) < 2016, ] - - res - }) %>% - compute(outdir = file.path(outpath, "appl_mdl_data"), overwrite = T) -timetaken(pt) - -# default rate by year of origination -system.time(drbyyr <- acqall1 %>% - srckeep(c("frst_dte", "default_next_12m")) %>% - mutate(frst_yr = substr(frst_dte, 4, 7)) %>% - group_by(frst_yr, hard = F) %>% - summarise(ndef = sum(default_next_12m, na.rm=T), n = n()) %>% - collect(parallel = T) %>% - group_by(frst_yr) 
%>% - summarise(ndef = sum(ndef), n = sum(n)) %>% - mutate(odr = ndef/n)) - -drbyyr %>% - ggplot + - geom_line(aes(x = frst_yr %>% as.numeric, y = odr)) diff --git a/tutorials/fannie_mae_new/5b_one_var.r b/tutorials/fannie_mae_new/5b_one_var.r deleted file mode 100644 index 8b796e2e..00000000 --- a/tutorials/fannie_mae_new/5b_one_var.r +++ /dev/null @@ -1,189 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -# XGBoost on one var ------------------------------------------------------ -system.time(xy <- acqall1[,c("default_next_12m", "oltv"), keep=c("default_next_12m", "oltv")]) - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv"])) - -portfolio_default_rate = xy[,sum(default_next_12m, na.rm = T)/.N] - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -prev_pred = predict(m, dtrain) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+\\.[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+\\.[\\d]+") %>% as.numeric) %>% - sort %>% - floor -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv), m2 = max(oltv)), .(bins = cut(oltv,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,new_bins,Inf)))][order(bins)] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - -if(F) { - system.time(xyz <- acqall1[,c("default_next_12m", "oltv", "frst_dte"), keep=c("default_next_12m", "oltv", "frst_dte")]) - - xyz[,frst_yr := substr(frst_dte,4,7) %>% as.integer] - xyz[,frst_dte:=NULL] - - bb = 
xyz[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv,c(-Inf,bins,Inf)), frst_yr)] - - bb[,binton := sum(N), bins] - bb[,dr := ndef/binton] - - setkey(bb, bins) - bb[order(frst_yr,decreasing = T),text_y_pos := cumsum(dr) - dr/2, bins] - bb[,text := substr(frst_yr, 3,4)] - - bb %>% - filter(!is.na(bins)) %>% - mutate(`Yr of Orig` = as.factor(frst_yr), `Orig LTV Band` = bins, `Default Rate%` = dr) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`, fill = `Yr of Orig`), stat = 'identity') + - geom_text(aes(x = `Orig LTV Band`, y = text_y_pos, label = text)) + - coord_flip() -} - - -# what if we fit them in rounded number --------------------------------------------------------- - -xy[oltv > 80, oltv_round := ceiling(oltv/5)*5] -xy[oltv <= 80, oltv_round := ceiling(oltv/10)*10] -xy[oltv <= 40, oltv_round := ceiling(oltv/20)*20] -#xy = xy[!is.na(oltv_round),] - -xy[is.na(oltv_round),] - -xy[,.N, oltv_round] - -# transform into format accepted by XGBoost -dtrain <- xgb.DMatrix(label = xy$default_next_12m, data = as.matrix(xy[,"oltv_round"])) - -pt = proc.time() -m <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = 1, - base_score = portfolio_default_rate) -timetaken(pt) - -map_chr(xgb.dump(m), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - -bb = xy[,.(ndef = sum(default_next_12m), .N, m1 = min(oltv_round), m2 = max(oltv_round)), .(bins = cut(oltv_round,c(-Inf,bins,Inf)))] -new_bins = sort(unique( bb$m2)) -bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut(oltv_round,c(-Inf,new_bins,Inf)))][order(bins)] -bb[,odr := ndef/N] - -setkey(bb, bins) -bb %>% - filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() - - 
-prev_pred = predict(m, dtrain) - -prev_pred1 = predict(m, dtrain, predcontrib=T) - -# add another variable ---------------------------------------------------- -target = "default_next_12m" -feature = "orig_amt" -df = acqall1 -format_fn = base::I -existing_model = prev_pred -monotone_constraints = -1 - -auc <- function(target, score) { - df = data.table(target, score) - - df1 = df[,.(nt = sum(target), n = .N, score)] - setkey(df1, score) -} - -add_var_to_scorecard <- function(df, target, feature, monotone_constraints = 0, prev_pred = NULL, format_fn = base::I) { - - xy = df %>% - srckeep(c(target, feature)) %>% - collect(parallel = T) - - # evaluate - code = glue::glue("xy = xy %>% mutate({feature} = format_fn({feature}))") - eval(parse(text = code)) - - dtrain <- xgb.DMatrix(label = xy[,target, with = F][[1]], data = as.matrix(xy[,c(feature), with = F])) - - if(is.null(prev_pred)) { - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - } else { - setinfo(dtrain, "base_margin", prev_pred) - pt = proc.time() - m2 <- xgboost( - data=dtrain, - nrounds = 1, - objective = "binary:logitraw", - tree_method="exact", - monotone_constraints = monotone_constraints - ) - timetaken(pt) - - a2 = predict(m2, dtrain) - a3 = predict(m2, dtrain, predcontrib = T) - } - - map_chr(xgb.dump(m2), ~str_extract(.x,"\\[f0<[\\d]+[\\.]{0,1}[\\d]+\\]")) %>% - keep(~!is.na(.x)) %>% - map_dbl(~str_extract(.x, "[\\d]+[\\.]{0,1}[\\d]+") %>% as.numeric) %>% - sort -> bins - - code = glue::glue("bb = xy[,.(ndef = sum({target}), .N, m1 = min({feature}), m2 = max({feature})), .(bins = cut({feature},c(-Inf,bins,Inf)))]") - eval(parse(text = code)) - - new_bins = sort(unique(bb$m2)) - code1 = glue::glue("bb = xy[,.(ndef = sum(default_next_12m), .N), .(bins = cut({feature},c(-Inf,new_bins,Inf)))][order(bins)]") - eval(parse(text = code1)) - - setkey(bb, bins) - bb %>% - 
filter(!is.na(bins)) %>% - mutate(`Orig LTV Band` = bins, `Default Rate%` = ndef/N) %>% - ggplot + - geom_bar(aes(x = `Orig LTV Band`, y = `Default Rate%`), stat = 'identity') + - coord_flip() -} - diff --git a/tutorials/fannie_mae_new/5c_using_a_fn.r b/tutorials/fannie_mae_new/5c_using_a_fn.r deleted file mode 100644 index e083fa54..00000000 --- a/tutorials/fannie_mae_new/5c_using_a_fn.r +++ /dev/null @@ -1,54 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall1 = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt <- proc.time() -firstm <- add_var_to_scorecard(acqall1, "default_next_12m", "oltv", monotone_constraints = 1, format_fn = function(v) { - ceiling(v / 5) * 5 -}) -timetaken(pt) - -# now add the second variable --------------------------------------------- -pt <- proc.time() -secondm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "dti", - prev_pred = firstm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -thirdm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "orig_trm", - prev_pred = secondm$prev_pred, - monotone_constraints = 1) -timetaken(pt) - -pt <- proc.time() -fourthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "num_bo", - prev_pred = thirdm$prev_pred, - monotone_constraints = -1, - format_fn = as.numeric) -timetaken(pt) - -pt <- proc.time() -fifthm <- add_var_to_scorecard( - acqall1, - "default_next_12m", - "cscore_b", - prev_pred = fourthm$prev_pred, - monotone_constraints = -1) -timetaken(pt) - - -# plot improvement -------------------------------------------------------- -plot(map_dbl(list(firstm, secondm, thirdm, fourthm, fifthm), ~.x$auc), type="b", ylim = c(0.4,0.9)) -abline(h = 0.7, lty = 2) - diff --git a/tutorials/fannie_mae_new/5d_AutoML.r b/tutorials/fannie_mae_new/5d_AutoML.r deleted file mode 100644 index 7525c879..00000000 --- a/tutorials/fannie_mae_new/5d_AutoML.r +++ /dev/null @@ -1,34 +0,0 @@ -source("inst/fannie_mae/0_setup.r") 
-library(disk.frame) -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) - -pt = proc.time() -acqall_all = acqall %>% - map.disk.frame(~{ - defs = .x[default_next_12m == T,] - defs[,weight:=1] - non_defs = .x[default_next_12m == F, ] - non_defs[,weight:=10] - - rbindlist(list(defs, sample_frac(non_defs, 0.1)), fill = T, use.names = T) - }, lazy = F, outdir = file.path(outpath, "appl_mdl_data_sampled"), overwrite = T) -timetaken(pt) - -#acqall_all = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -acqall_dev = sample_frac(acqall_all, 0.7) %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - - -uid_dev = acqall_dev %>% - srckeep("loan_id") %>% - mutate(loan_id = unique(loan_id)) %>% - collect(parallel = T) %>% - mutate(loan_id = unique(loan_id)) - - -acqall_val = acqall_all %>% - anti_join(uid_dev, by = "loan_id") %>% - write_disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) diff --git a/tutorials/fannie_mae_new/5e_AutoML_actual.r b/tutorials/fannie_mae_new/5e_AutoML_actual.r deleted file mode 100644 index 9e7db972..00000000 --- a/tutorials/fannie_mae_new/5e_AutoML_actual.r +++ /dev/null @@ -1,90 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - #browser() - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - #browser() - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - 
)) - - # which has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -plot(map_dbl(res_all, ~.x$auc)) - -saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - "default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R b/tutorials/fannie_mae_new/5f_make_scorecard_visible.R deleted file mode 100644 index 2d696a29..00000000 --- 
a/tutorials/fannie_mae_new/5f_make_scorecard_visible.R +++ /dev/null @@ -1,94 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val")) - -#the model -mdl = readRDS("model.rds") - -plot(purrr::map_dbl(mdl, ~.x$auc)) - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = .x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, 
negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) -system.time(plot_auc(acqall_dev)) - - -# score on whole ---------------------------------------------------------- -acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -system.time(plot_auc(acqall)) - - diff --git a/tutorials/fannie_mae_new/6a_origination_year.r b/tutorials/fannie_mae_new/6a_origination_year.r deleted file mode 100644 index 8115a782..00000000 --- a/tutorials/fannie_mae_new/6a_origination_year.r +++ /dev/null @@ -1,31 +0,0 @@ - -acqall2 = acqall1 %>% - srckeep(c("orig_dte", "first_default_date")) %>% - mutate(orig_yr = substr(orig_dte, 4,7), yr_1st_d = year(first_default_date)) %>% - group_by(orig_yr, yr_1st_d, hard = F) %>% - summarise(n=n()) %>% - collect %>% - group_by(orig_yr, yr_1st_d) %>% - summarise(n = sum(n)) - -acqall2[,tot_n := sum(n), orig_yr] - -acqall3 <- acqall2[!is.na(yr_1st_d),] - -acqall3[,dr := n/tot_n] - -acqall3 %>% - filter(orig_yr > 1999) %>% - mutate(`Origination Year` = orig_yr) %>% - ggplot + - geom_line(aes(x = yr_1st_d, y = dr, colour = `Origination Year`)) + - xlab("Year of Observation") + - ylab("Ratio of defaulted accounts vs # of accts at orig") + - scale_x_continuous(breaks=2000:2017, labels=as.character(2000:2017)) + - scale_y_continuous(expand = c(0, 0)) + - ggtitle("Fannie Mae Single Family Loans: Ratio of defaults vs # of accounts in same year of origination") - - - - - diff --git a/tutorials/fannie_mae_new/7a_speedglm.r b/tutorials/fannie_mae_new/7a_speedglm.r deleted file mode 100644 index 3c3fe0c6..00000000 --- 
a/tutorials/fannie_mae_new/7a_speedglm.r +++ /dev/null @@ -1,75 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) - -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(0,100,by=10)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,100,by=10), Inf)))] - .x -}) - -library(speedglm) -library(biglm) - -shglm(default_next_12m ~ oltv_band-1, - datafun = streamacq, family=binomial()) - -bigglm(default_next_12m ~ oltv_band-1, data = streamacq, family=binomial()) - -make.data<-function(filename, chunksize,...){ - conn<-NULL - function(reset=FALSE){ - if(reset){ - if(!is.null(conn)) - close(conn) - conn<<-file(filename,open="r")} else{ - rval<-read.table(conn, nrows=chunksize,...) - if ((nrow(rval)==0)) { - close(conn) - conn<<-NULL - rval<-NULL} - return(rval)}}} -# data1 is a small toy dataset -data(data1) -write.table(data1,"data1.txt",row.names=FALSE,col.names=FALSE) -rm(data1) -da<-make.data("data1.txt",chunksize=50,col.names=c("y","fat1","x1","x2")) -# Caution! 
make sure to close the connection once you have run command #1 -da(reset=T) #1: opens the connection to "data1.txt" -da(reset=F) #2: reads the first 50 rows (out of 100) of the dataset -da(reset=F) #3: reads the second 50 rows (out of 100) of the dataset -da(reset=F) #4: is NULL: this latter command closes the connectionrequire(biglm)# fat1 is a factor with four levels - -b1<-shglm(y~factor(fat1)+x1,weights=~I(x2^2),datafun=da,family=Gamma(log)) -b2<-bigglm(y~factor(fat1)+x1,weights=~I(x2^2),data=da,family=Gamma(log)) -summary(b1) -summary(b2) - diff --git a/tutorials/fannie_mae_new/8a_keras.r b/tutorials/fannie_mae_new/8a_keras.r deleted file mode 100644 index 1fc7dcda..00000000 --- a/tutorials/fannie_mae_new/8a_keras.r +++ /dev/null @@ -1,153 +0,0 @@ -source("inst/fannie_mae/0_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - #browser() - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(40,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, seq(0,64,by=8))))] - .x -}) - - -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -model = build_model() - -ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","dti")]) -dl = levels(acqall_dev1[,dti_band, keep=c("oltv","dti")]) - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si 
= sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.7) - osi = setdiff(1:nchunks(acqall_dev1), si) - #browser() - system.time(a <- map_dfr(si, ~{ - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)]#, - #a[,keras::to_categorical(as.integer(dti_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - gwdt$band = c(ol#,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(dti_band %>% as.integer-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc= auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag(4))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c(rep("oltv_band", length(ol)) - #, rep("dti_band", length(dl)+1) - ) - scrs$band = c(ol - #, dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } -} - -pt = proc.time() -kk() -timetaken(pt) -View(scrs) diff --git a/tutorials/flights-100-1000.r b/tutorials/flights-100-1000.r deleted file mode 100644 index 
da2f528f..00000000 --- a/tutorials/flights-100-1000.r +++ /dev/null @@ -1,49 +0,0 @@ -library(data.table) -library(disk.frame) -setup_disk.frame() - -bench_disk.frame_data.table_group_by <- function(data1,n) { - a.sharded.df = as.disk.frame(data1, shardby = c("year", "month", "day")) - - a.not_sharded.df = as.disk.frame(data1) - - a = copy(data1) - setDT(a) - - data.table_timing = system.time(a[,.(mean_dep_time = mean(dep_time, na.rm=T)), .(year, month, day)])[3] - - disk.frame_sharded_timing = system.time( - a.sharded.df[ - , - .(mean_dep_time = mean(dep_time, na.rm=TRUE)), - .(year, month, day), - keep = c("year", "month","day", "dep_time")])[3] - - - disk.frame_not_sharded_timing = system.time( - a.not_sharded.df[ - , - .( - sum_dep_time = sum(dep_time, na.rm=TRUE), - n = sum(!is.na(dep_time)) - ), - .(year, month, day), - keep = c("year", "month","day", "dep_time")][ - , - .(mean_dep_time = sum(sum_dep_time)/sum(n)), - .(year, month, day) - ])[3] - - barplot( - c(data.table_timing, disk.frame_sharded_timing, disk.frame_not_sharded_timing), - names.arg = c("data.table", "sharded disk.frame", "not sharded disk.frame"), - main = glue:glue("flights duplicated {n} times group-by year, month, day"), - ylab = "Seconds") -} - -system.time(flights_100 <- rbindlist(lapply(1:100, function(x) nycflights13::flights))) - -system.time(flights_1000 <- rbindlist(lapply(1:10, function(x) flights_100))) - -bench_disk.frame_data.table_group_by(flights_100, 100) -bench_disk.frame_data.table_group_by(flights_1000, 1000) diff --git a/tutorials/flights_case_study/0-download-data.sh b/tutorials/flights_case_study/0-download-data.sh deleted file mode 100644 index ba7b1350..00000000 --- a/tutorials/flights_case_study/0-download-data.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -cd inst/flights_case_study/data/ - -for i in {1987..2008}; do wget -q http://stat-computing.org/dataexpo/2009/"$i".csv.bz2; done - -for i in {1987..2008}; do bunzip2 "$i".csv.bz2; done diff --git 
a/tutorials/flights_case_study/1-make-disk-frame.r b/tutorials/flights_case_study/1-make-disk-frame.r deleted file mode 100644 index 19e182b0..00000000 --- a/tutorials/flights_case_study/1-make-disk-frame.r +++ /dev/null @@ -1,39 +0,0 @@ - library(disk.frame) - - pt = proc.time() - non_shard = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) - - - sharded = 2007:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) - - - - nrow(non_shard) - nrow(sharded) - - library(disk.frame) - sharded = disk.frame("inst/data/flights_all_shard.df") - a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - - a = a[,sum(N), UniqueCarrier] - - nosharded = disk.frame("inst/data/flights_all_no_shard.df/") - a1 = nosharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - a1 = a1[,sum(N), UniqueCarrier] - - setkey(a, UniqueCarrier) - setkey(a1, UniqueCarrier) - - identical(a, a1) - diff --git a/tutorials/flights_case_study/2-shard-check.r b/tutorials/flights_case_study/2-shard-check.r deleted file mode 100644 index 5ae36ddc..00000000 --- a/tutorials/flights_case_study/2-shard-check.r +++ /dev/null @@ -1,46 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) - - -b = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = 
T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) - - -aa = a[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -bb = b[,.N, UniqueCarrier, keep=c("UniqueCarrier")][,sum(N), UniqueCarrier][order(UniqueCarrier)] - -identical(aa,bb) - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - order_by(UniqueCarrier) - -b2 = b %>% group_by(UniqueCarrier) %>% - summarise(n=count()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) - diff --git a/tutorials/flights_case_study/2-shard-check2.r b/tutorials/flights_case_study/2-shard-check2.r deleted file mode 100644 index b4b97162..00000000 --- a/tutorials/flights_case_study/2-shard-check2.r +++ /dev/null @@ -1,31 +0,0 @@ -library(disk.frame) -plan(multiprocess) -library(magrittr) - -a = disk.frame("flights_all_sharded.df") -b = disk.frame("flights_all_notsharded.df") - -a2 = a %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -b2 = b %>% - group_by(UniqueCarrier) %>% - summarise(n=n()) %>% - collect %>% - group_by(UniqueCarrier) %>% - summarise(n=sum(n)) %>% - arrange(UniqueCarrier) - -identical(a2, b2) - -fa = list.files("inst/flights_case_study/data", - pattern="*.csv", - full.names = T) %>% - head(10) - -ab = csv_to_disk.frame(fa, outdir ="pls_del_tmp", overwrite = T) diff --git a/tutorials/flights_case_study/testing.r b/tutorials/flights_case_study/testing.r deleted file mode 100644 index b47cdee6..00000000 --- a/tutorials/flights_case_study/testing.r +++ /dev/null @@ -1,67 +0,0 @@ -library(disk.frame) -library(furrr) -library(magrittr) -plan(multiprocess) - -# 
fs::dir_delete("tmp22007") -# fs::dir_delete("tmp22008") -# fs::dir_delete("tmp32007") -# fs::dir_delete("tmp32008") -# fs::dir_delete("tmp2007") -# fs::dir_delete("tmp2008") - -pt = proc.time() -full = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp3",.x), - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -cnames = names(full) - -pt = proc.time() -non_shard = 1987:2008 %>% - furrr::future_map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp",.x), - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_no_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -pt = proc.time() -sharded = 1987:2008 %>% - purrr::map(~ csv_to_disk.frame(paste0("inst/flights_case_study/data/",.x,".csv"), - outdir=paste0("tmp2",.x), - shardby=c("Year","Month"), - col.names = cnames, - in_chunk_size = 3e6, - overwrite=T)) %>% - rbindlist.disk.frame( - outdir = "inst/data/flights_all_shard.df", by_chunk_id = T) -print(timetaken(pt)) - -nrow(full) -nrow(non_shard) -nrow(sharded) - -library(disk.frame) -# sharded = disk.frame("inst/data/flights_all_shard.df") -a = sharded[,.N, UniqueCarrier, keep="UniqueCarrier"] - -a = a[,sum(N), UniqueCarrier] - -# non_shard = disk.frame("inst/data/flights_all_no_shard.df/") -a1 = non_shard[,.N, UniqueCarrier, keep="UniqueCarrier"] -a1 = a1[,sum(N), UniqueCarrier] - -setkey(a, UniqueCarrier) -setkey(a1, UniqueCarrier) - -identical(a, a1) - -a2 = merge(a,a1, by="UniqueCarrier") - -a2[,all(V1.x == V1.y)] diff --git a/tutorials/flights_case_study/testing2.r b/tutorials/flights_case_study/testing2.r deleted file mode 100644 index d12f0aeb..00000000 --- a/tutorials/flights_case_study/testing2.r +++ /dev/null @@ -1,31 +0,0 @@ -#install.packages("tidyverse") -library(readr) -library(disk.frame) -library(data.table) 
-library(future) -library(magrittr) -plan(multiprocess) - -pt=proc.time() -a = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - nchunks = 128, - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_notsharded.df", by_chunk_id = T) -print(timetaken(pt)) - -pt=proc.time() -b = list.files("inst/flights_case_study/data/", - pattern="*.csv", - full.names = T) %>% - head(10) %>% # only 10 files so that we can test faster - purrr::map(~ csv_to_disk.frame(.x, - outdir=paste0("tmp", readr::parse_number(.x)), - shardby = c("Year","Month"), - overwrite=T)) %>% - rbindlist.disk.frame(outdir = "flights_all_sharded.df", by_chunk_id = T) -print(timetaken(pt)) \ No newline at end of file diff --git a/tutorials/nyc-taxi/nyc-taxi.Rmd b/tutorials/nyc-taxi/nyc-taxi.Rmd deleted file mode 100644 index 658fcc8d..00000000 --- a/tutorials/nyc-taxi/nyc-taxi.Rmd +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "NYC Taxi" -author: "ZJ" -date: "8/12/2019" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - - -```{r} -library(disk.frame) -setup_disk.frame() -options(future.globals.maxSize = Inf) -``` - -## Loading the data -```{r} -system.time(nyc.df <- csv_to_disk.frame(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE))) -``` - -```{r} -sapply(list.files("c:/data/nyc-taxi-data/",pattern="*.csv", full.names = TRUE), function(x) { - ncol(data.table::fread(x, nrows = 2)) -}) -``` - -```{r} -nyc.df = disk.frame("nyc.df") -a = nyc.df[,uniqueN(Dispatching_base_num), keep=names(nyc.df)[1]] -a -``` - - -```{r} -nyc.df = disk.frame("nyc.df") -head(nyc.df) -``` - - -```{r} -library(magrittr) -library(lubridate) -system.time(nyc2.df <- nyc.df %>% - cmap(function(chunk) { - chunk[!is.na(Dispatching_base_number), Dispatching_base_num := 
Dispatching_base_number] - chunk[, Dispatching_base_number:=NULL] - chunk[, Pickup_DateTime := lubridate::as_datetime(Pickup_DateTime)] - chunk[!is.na(Pickup_date), Pickup_DateTime := lubridate::as_datetime(Pickup_date)] - chunk[,Pickup_date:=NULL] - chunk - }, outdir="nyc2.df", lazy = FALSE, overwrite = TRUE)) -``` - -```{r} -nyc2.df = disk.frame("nyc2.df") -head(nyc2.df) -``` - - -```{r} -system.time(shard(nyc2.df, shardby = "Dispatching_base_num", outdir="nyc.sharded.df")) -``` - -```{r} - -``` \ No newline at end of file diff --git a/tutorials/readme.r b/tutorials/readme.r deleted file mode 100644 index bc7d241e..00000000 --- a/tutorials/readme.r +++ /dev/null @@ -1,94 +0,0 @@ -#install.packages(c("fst","future","data.table")) -if(!require(devtools)) install.packages("devtools") -if(!require(disk.frame)) devtools::install_github("xiaodaigh/disk.frame") - -library(dplyr) -library(disk.frame) -library(data.table) -library(fst) -#library(future) -#library(future.apply) -#library(data.table) -nworkers = parallel::detectCores(logical=F) -cat(nworkers," workers\n") -#plan(multiprocess, workers = nworkers, gc = T) -#options(future.globals.maxSize=Inf) - -rows_per_chunk = 1e7 -# generate synthetic data -tmpdir = "tmpfst" -fs::dir_delete(tmpdir) - -# write out 2*nworkers chunks -pt = proc.time() -df = disk.frame(tmpdir) -purrr::walk(1:(nworkers*2), function(ii) { - system.time(ab <- data.table(a = runif(rows_per_chunk), b = runif(rows_per_chunk)) ) #102 seconds - add_chunk(df, ab, ii) -}) -cat("Generating data took: ", timetaken(pt), "\n") - - -# read and output the disk.frame as it to assess "sequential" read-write performance -pt = proc.time() -df2 <- map.disk.frame(df, ~.x, outdir = "tmpfst2", lazy = F, overwrite = T) -cat("Read and write took: ", timetaken(pt), "\n") - -# get first few rows -head(df) - -# get last few rows -tail(df) - -# number of rows -nrow(df) - -# number of columns -ncol(df) - - -# dplyr verbs 
------------------------------------------------------------- -df = disk.frame(tmpdir) - -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) - -# need a 2nd stage to finalise summing -df %>% - summarise(suma = sum(a)) %>% # this does a count per chunk - collect(parallel = T) %>% - summarise(suma = sum(suma)) - -# filter -pt = proc.time() -system.time(df_filtered <- df %>% - filter(a < 0.1)) -cat("filtering a < 0.1 took: ", timetaken(pt), "\n") -nrow(df_filtered) - -# group by -pt = proc.time() -res1 <- df %>% - filter(b < 0.1) %>% - mutate(blt005 = b < 0.05) %>% - group_by(blt005, hard = T) %>% # hard group_by is slower but avoid a 2nd stage aggregation - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("group by took: ", timetaken(pt), "\n") - - -# keep only one var is faster -pt = proc.time() -res1 <- df %>% - srckeep("a") %>% #keeping only the column `a` from the input - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise keeping only one column ", timetaken(pt), "\n") - -# same operation without keeping -pt = proc.time() -res1 <- df %>% - summarise(suma = sum(a), n = n()) %>% - collect(parallel = T) -cat("summarise without keeping", timetaken(pt), "\n") diff --git a/tutorials/surf_2019_02_demo/.gitignore b/tutorials/surf_2019_02_demo/.gitignore deleted file mode 100644 index bef9413c..00000000 --- a/tutorials/surf_2019_02_demo/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.ipynb_checkpoints -surf_201902.nb.html -surf_201902.html -01_surf_201902.html -01_surf_201902.tex diff --git a/tutorials/surf_2019_02_demo/00_setup.r b/tutorials/surf_2019_02_demo/00_setup.r deleted file mode 100644 index 5cdf0dec..00000000 --- a/tutorials/surf_2019_02_demo/00_setup.r +++ /dev/null @@ -1,41 +0,0 @@ -library(glue) -library(purrr) -library(fst) -library(tidyr) -library(ggplot2) -library(stringr) -library(xgboost) -library(lubridate) -library(disk.frame) - -raw_perf_data_path = 
"C:/data/Performance_All/" -#raw_perf_data_path = "d:/data/Performance_All_small/" - -# where the outputs go -outpath = "c:/data/fannie_mae_disk_frame/" -#outpath = "d:/data/fannie_mae_disk_frame_small/" - -Performance_ColClasses = - c("character", "character", "character", "numeric", "numeric", "numeric", "numeric", - "numeric", "character", "character", "character", "character", "character", "character", - "character", "character", "character", "numeric", "numeric", "numeric", "numeric", "numeric", - "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "character", "numeric", - "character") -Performance_Variables = - c("LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB", "Loan.Age", - "Months.To.Legal.Mat" , "Adj.Month.To.Mat", "Maturity.Date", "MSA", "Delq.Status", - "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE", "FCC_DTE","DISP_DT", "FCC_COST", - "PP_COST", "AR_COST", "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS", - "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB_FHFA", "REPCH_FLAG", "PRIN_FORG_UPB_OTH", - "TRANSFER_FLG") %>% tolower - -dfiles = dir(raw_perf_data_path, full.names = T) -short_dfiles = dir(raw_perf_data_path) - -Acquisitions_Variables = c("LOAN_ID", "ORIG_CHN", "Seller.Name", "ORIG_RT", "ORIG_AMT", "ORIG_TRM", "ORIG_DTE" - ,"FRST_DTE", "OLTV", "OCLTV", "NUM_BO", "DTI", "CSCORE_B", "FTHB_FLG", "PURPOSE", "PROP_TYP" - ,"NUM_UNIT", "OCC_STAT", "STATE", "ZIP_3", "MI_PCT", "Product.Type", "CSCORE_C", "MI_TYPE", "RELOCATION_FLG") %>% tolower() - -Acquisition_ColClasses = c("character", "character", "character", "numeric", "numeric", "integer", "character", "character", "numeric", - "numeric", "character", "numeric", "numeric", "character", "character", "character", "character", "character", - "character", "character", "numeric", "character", "numeric", "numeric", "character") diff --git a/tutorials/surf_2019_02_demo/01_surf_201902.rmd b/tutorials/surf_2019_02_demo/01_surf_201902.rmd deleted file mode 100644 index 
418ef86f..00000000 --- a/tutorials/surf_2019_02_demo/01_surf_201902.rmd +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -# 2_exploratory.r -#source("../fannie_mae/00_setup.r") -source("00_setup.r") -``` - -# Assign a `disk.frame` -You use `disk.frame(path)` to assign a disk.frame - -```{r} -dir(file.path(outpath, "fm_with_harp")) %>% - head -``` - -Use the function `disk.frame` to assign a folder -```{r} -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp -``` - -## Simple info about disk.frame -```{r} -nrow(fm_with_harp) - -ncol(fm_with_harp) -``` - - -```{r} -head(fm_with_harp) -``` - -```{r} -tail(fm_with_harp) -``` - -## Simple group-by example - -I want create the `sum(default_12m)` group by month. The sql statement looks like - -```sql -select - monthly.rpt.prd, - sum(default_12m) as n_defaults, - count(*) as cnt, - n_default/cnt as odr # observed default rate -from - table -group by - monthly.rpt.prd; -``` -This analysis only uses two columns, namely `default_12m` and `monthly.rpt.prd`. So I use `srckeep` to ensure that only those two columns are loaded. - -The backend for disk.frame is fst which allows **random access** to columns AKA load only what you need! This is importance for performance! -```{r} -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T))) -``` -The error recognises that the `shardkey` is `loan_id` which is not the group-by variable - `monthly.rpt.prd`. - -The operations returned almost instantly, this is because the computation was done lazily. For now operations are recorded. No computation will take place until `collect` or `compute` is called. - -Now **play** the recording by calling `collect`. This will take a minute. 
Back to the slides: - -```{r} -system.time(a_wh2 <- a_wh1 %>% collect) # 60~70 plugged in - -a_wh2 -``` - -```{r} -# temporary -# a_wh1 %>% -# srckeep(c("monthly.rpt.prd", "default_12m")) %>% -# map(function(chunk) { -# chunk[1,] -# }) %>% -# collect -``` - - -```{r} -# temporary -# a_wh1 %>% -# srckeep(c("monthly.rpt.prd", "default_12m")) %>% -# map(~{ -# .x[1,.SD] -# }) %>% - # collect -``` - - -once `collect` is called the resultant data is stored as a data.frame. - -However this is not the correct result, as the group by was performed within each chunk. Hence we need a second stage group by. The second group by takes no time at all, as everything was done in memory - -```{r} -system.time(a_wh3 <- a_wh2 %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - -Manipulate the data and plot -```{r} -a_wh4 = a_wh3 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r deleted file mode 100644 index 81c1a330..00000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.r +++ /dev/null @@ -1,6 +0,0 @@ -source("inst/fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) - -fm_with_harp diff --git a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd b/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd deleted file mode 100644 index 4a6f760d..00000000 --- a/tutorials/surf_2019_02_demo/10_d_Fannie_mae_1_8b.rmd +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Simple Fannie Mae Example" -output: - html_document: - df_print: paged - pdf_document: default ---- - -```{r} -source("../fannie_mae/00_setup.r") - -fm_with_harp = - disk.frame(file.path(outpath, "fm_with_harp")) 
- -fm_with_harp -``` - - -```{r} -system.time(a_wh1 <- fm_with_harp %>% - srckeep(c("default_12m","monthly.rpt.prd")) %>% - group_by(monthly.rpt.prd) %>% - summarise( - N = n(), - n_defaults = sum(default_12m, na.rm = T)) %>% - collect %>% - group_by(monthly.rpt.prd) %>% - summarise( - odr = sum(n_defaults)/sum(N) - ) %>% - rename( - Date = monthly.rpt.prd, - `Observed Default Rate%` = odr - )) -``` - - -```{r} -a_wh4 = a_wh1 %>% - gather(key = type, value=rate, -Date) - -ggplot(a_wh4) + - geom_line(aes(x=Date, y = rate, colour = type)) + - ggtitle("Fannie Mae Observed Default Rate over time & HARP Conversion Rate") -``` - -$$ -600 = a + b\times\log(\frac{1-p}{p}) = a + b\times\log(30)\\ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ - -$$ -620 = a + b\times(\alpha + \vec{X}^T\vec{\beta}) = a + b\times\log(60) -$$ \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r b/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r deleted file mode 100644 index a87a8945..00000000 --- a/tutorials/surf_2019_02_demo/10_e_AutoML_XGBoost_Scorecards.r +++ /dev/null @@ -1,112 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(xgboost) -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev")) - -add_var_to_scorecard = disk.frame:::add_var_to_scorecard - -# 5d develop a function to test all variables -check_which_is_best <- function(df, target, features, monotone_constraints, format_fns, weight=NULL) { - - prev_pred = NULL - ws = NULL - vars_scr = NULL - - while(length(features) > 0) { - # try every var - - res = furrr::future_map(1:length(features), ~add_var_to_scorecard( - #res = purrr::map(1:length(features), ~add_var_to_scorecard( - df, - target, - features[.x], - monotone_constraints = monotone_constraints[.x], - prev_pred = prev_pred, - format_fn = format_fns[[.x]], - weight, - save_model_fname = glue::glue("{features[.x]}.xgbm") - )) - - # which 
has the best auc - w = which.max(map_dbl(res, ~.x$auc)) - ws = c(ws, w) - feature_to_fit = features[w] - print(glue::glue("choosen: {feature_to_fit}")) - - var_scr1 = res[[w]] - eval(parse(text = glue::glue("vars_scr = c(vars_scr, list({feature_to_fit} = var_scr1))"))) - - features = features[-w] - format_fns = format_fns[-w] - monotone_constraints = monotone_constraints[-w] - - prev_pred = var_scr1$prev_pred - } - vars_scr -} - -num_vars = c("mi_pct", "orig_amt", "orig_rt", "ocltv", "mi_type", "cscore_c", "oltv", "dti", "orig_trm", "cscore_b", "num_bo", "num_unit", "orig_dte", "frst_dte") -num_vars_mon = c(0 , 1 , 1 , 1 , 0 , -1 , 1 , 1 , 1 , -1 , -1 , 0 , 0 , 0) -num_var_fmt_fn = c(map(1:10, ~base::I), map(1:2, ~as.numeric), map(1:2, ~function(x) { - as.numeric(substr(x, 4, 7)) -})) - -cat_vars = setdiff(names(acqall_dev), num_vars) %>% - setdiff( - c("loan_id", "default_next_12m", "first_default_date", "mths_to_1st_default", "zip_3", "weight")) - -pt = proc.time() -res_all = check_which_is_best( - acqall_dev, - "default_next_12m", - features = c(num_vars, cat_vars), - monotone_constraints = c(num_vars_mon, rep(0, length(cat_vars))), - format_fns = c(num_var_fmt_fn, map(1:length(cat_vars), ~base::I)), - weight = "weight" -) -timetaken(pt) - -aucs = map_dbl(res_all, ~.x$auc) -plot( - aucs, - main = "Fannie Mae Application Scorecard AUC", - ylab="AUC", - xlab = "Number of features") - -if(F) { - library(gganimate) - auc_dt = data.table(num_of_vars = 1:length(aucs), auc = aucs) - auc_dt %>% - ggplot(aes(num_of_vars, auc)) + - geom_point() + - transition_states( - num_of_vars, - transition_length = 2, - state_length = 1 - ) + - enter_fade() + - exit_shrink() + - ease_aes('sine-in-out') -} - -#saveRDS(res_all, "model.rds") - -if(F) { - rescat = check_which_is_best( - acqall_dev, - "default_next_12m", - c("seller.name", "oltv", "state", "dti", "mi_type"), - c(0 , 1 , 0 , 1 , 0 ), - map(1:5, ~base::I) - ) - - - system.time(res <- check_which_is_best( - acqall1, - 
"default_next_12m", - c("ocltv", "cscore_c", "mi_type", "oltv", "dti", "orig_trm", "cscore_b", "num_bo"), - c(1 , -1 , 0 , 1 , 1 , 1 , -1 , -1 ), - c(map(1:7, ~base::I), list(as.numeric)))) -} - - diff --git a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R b/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R deleted file mode 100644 index 675a81c2..00000000 --- a/tutorials/surf_2019_02_demo/10_f_make_scorecard_visible.R +++ /dev/null @@ -1,118 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) - -auc = disk.frame:::auc -acqall_val = disk.frame(file.path(outpath, "appl_mdl_data_sampled_val2")) -df = acqall_val -#the model -mdl = readRDS("model.rds") - -#' score one variable -score_one_var <- function(feature, x, bins, bias, format_fn = base::I) { - print(feature) - # numeric features (post format_fn) has only two columns in bins - if(ncol(bins) == 2) { - x1 = format_fn(x) - binsNA = evalparseglue("bins[is.na({feature}),]") - bins = evalparseglue("bins[!is.na({feature}),]") - score = bins$score[cut(x1, c(-Inf, bins[[2]], Inf))] - - if(nrow(binsNA) == 1) { - score[is.na(x1)] = binsNA[[1]] - } - } else { - x1 = format_fn(x) - x1df = evalparseglue("data.table({feature} = x1)") - x1df_scr = evalparseglue("merge(x1df, bins, by = '{feature}', all.x=T)") - score = x1df_scr$score - } - # if there are still na then assign the bias - #browser(expr = any(is.na(score))) - if(any(is.na(score))) { - warning(glue::glue("the feature `{feature}` has {sum(is.na(score))} unassigned scores out of {length(score)}, assigning them the bias = {bias}")) - score[is.na(score)] = bias - } - - score -} - - -#' Socre a scorecard -#' @param mdl the xg scorecard model -#' @param df a disk.frame -score_xg_scorecard <- function(df, mdl) { - res = furrr::future_map_dfc(mdl, ~{ - #res = purrr::map_dfc(mdl, ~{ - var = df %>% - srckeep(.x$feature) %>% - collect(parallel = F) - x = var[[1]] - feature = .x$feature - bins = .x$bins - bias = .x$bias - format_fn = 
.x$format_fn - score_one_var(feature, x, bins, bias, format_fn) - }) %>% rowSums - res -} - -plot_auc <- function(df, line = F) { - target = df %>% - srckeep("default_next_12m") %>% - collect(parallel = F) - - pt = proc.time() - target$score = score_xg_scorecard(df, mdl) - timetaken(pt) - - AUC = auc(target$default_next_12m, target$score) - GINI = 2*AUC-1 - - target[,negscore := -score] - setkey(target, negscore) - cts = target[,.(ctot = (1:.N)/.N, cbad = cumsum(default_next_12m)/sum(default_next_12m))] - - if(line == F) { - plot(cts[seq(1,.N, length.out=100),], type="l", xlim=c(0,1), - main=glue::glue("On Validation - AUC: {AUC %>% round(2)}; GINI: {GINI %>% round(2)}")) - abline(a=0,b=1,lty=2) - abline(h=0) - abline(v=0) - } else { - lines(cts[seq(1,.N, length.out=100),], lty = 3, col = "blue") - legend("topleft", c("val AUC", "random", "dev AUC"), lty=c(1,2,3), col=c("black","blue","black")) - } -} - -system.time(plot_auc(acqall_val)) - -# score on whole ---------------------------------------------------------- -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled_dev2")) -system.time(plot_auc(acqall_dev, line = T)) - -scorecard = map_dfr(mdl, ~{ - res = .x$bins - evalparseglue("res[,feature_lbl := as.character({.x$feature})]") - res[,variable := .x$feature] - res %>% - select(variable, feature_lbl, score ) %>% - mutate(score = round(-score*20/log(2))) -}) - -saveRDS(scorecard, "scorecard.rds") -scorecard = readRDS("scorecard.rds") -#View(scorecard) - -scorecard[, - low := c(-Inf, feature_lbl[-.N]), variable] -setnames(scorecard, "feature_lbl", "high") - -scorecard=scorecard[,.(variable, low, high, score)] - -DT::datatable(scorecard, options = list(pageLength=20)) - -# score on whole ---------------------------------------------------------- -# acqall = disk.frame(file.path(outpath, "appl_mdl_data")) -# system.time(plot_auc(acqall)) - - diff --git a/tutorials/surf_2019_02_demo/11_speedglm.R b/tutorials/surf_2019_02_demo/11_speedglm.R deleted file 
mode 100644 index 3e180cb9..00000000 --- a/tutorials/surf_2019_02_demo/11_speedglm.R +++ /dev/null @@ -1,51 +0,0 @@ -source("inst/fannie_mae/00_setup.r") -library(disk.frame) -library(speedglm) -library(biglm) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data")) -library(speedglm) - -head(acqall_dev) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 3) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("streaming: {i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - -acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, seq(60,100,by=20)))] - .x[,dti_band := addNA(cut(dti, c(-Inf, c(20,100), Inf)))] - .x -}) - -streamacq = stream_shglm(acqall_dev1) - -m = bigglm( - default_next_12m ~ oltv_band-1, - data = streamacq, - family=binomial()) -summary(m) - -# not gonna work -shglm(default_next_12m ~ oltv_band -1 #+ dti_band + - 1 - , - datafun = streamacq, family=binomial()) \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/12_keras.r b/tutorials/surf_2019_02_demo/12_keras.r deleted file mode 100644 index e038fb09..00000000 --- a/tutorials/surf_2019_02_demo/12_keras.r +++ /dev/null @@ -1,180 +0,0 @@ -source("inst/fannie_mae_10pct/00_setup.r") -library(disk.frame) -library(keras) - -acqall_dev = disk.frame(file.path(outpath, "appl_mdl_data_sampled")) - -#' A streaming function for speedglm -#' @param df a disk.frame -stream_shglm <- function(df) { - i = 0 - is = sample(nchunks(df), replace = F) - function(reset = F) { - - if(reset) { - print("you've reset") - i <<- 0 - } else { - i <<- i + 1 - #if (i > 4) { - if (i > nchunks(df)) { - return(NULL) - } - print(glue::glue("{i}/{nchunks(df)}")) - return(get_chunk(df, is[i])) - #return(get_chunk(df, i)) - } - } -} - 
-acqall_dev1 = acqall_dev %>% delayed(~{ - .x[,oltv_band := cut(oltv, c(-Inf, 60,80, Inf))] - #.x[,scr_band := addNA(cut(cscore_b, c(-Inf, 627,700, Inf)), ifany=F)] - .x[,scr_band := addNA(cut(cscore_b, c(-Inf, 700,716,725,742,748,766,794, Inf)), ifany=F)] - - .x -}) - -if(F) { - aa = acqall_dev1 %>% collect - glm(default_next_12m ~ oltv_band + scr_band - 1, data=aa) -} - -head(acqall_dev1) - -# compile a kera model -build_model <- function() { - model <- keras_model_sequential() %>% - layer_dense(units = 2, activation = 'softmax') - - - model %>% compile( - loss = "categorical_crossentropy", - optimizer = 'sgd' - ) - - model -} - -# compiling the model ~ -system.time(model <- build_model()) - -#ol = levels(acqall_dev1[,oltv_band, keep=c("oltv","cscore_b")]) -#dl = levels(acqall_dev1[,scr_band, keep=c("oltv","cscore_b")]) - -ol = levels(get_chunk(acqall_dev1,1)[,oltv_band]) -dl = levels(get_chunk(acqall_dev1,1)[,scr_band]) - - -#for(i in 1:nchunks(acqall_dev1)) { -kk <- function() { - - j = 0 - done = F - ii = 0 - while(!done) { - j <- j + 1 - si = sample(nchunks(acqall_dev1), nchunks(acqall_dev1)*0.3) - osi = setdiff(1:nchunks(acqall_dev1), si) - - system.time(a <- map_dfr(si, ~{ - - ii <- ii + 1 - i = .x - if(ii %% 20 == 0) print(glue::glue("{j}:{ii} {Sys.time()}")) - - a = get_chunk(acqall_dev1, i) - #a[,.(sum(default_next_12m), .N),oltv_band] - - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) - 1)] - #,a[,keras::to_categorical(as.integer(scr_band) - 1)] - ) - - at = a[,default_next_12m*1] - Y_train = keras::to_categorical(at) - - hist = model %>% fit( - a1, - Y_train, - epochs = 1, - validation_split = 0.2, - verbose = 0 - ) - - gw = get_weights(model) - - gwdt = as.data.table(gw[1]) - setnames(gwdt, names(gwdt), c("non_default", "default")) - gwdt$i =i - gwdt$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - gwdt$band = c(ol,dl - ) - gwdt - })) - - some_chunks <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev1, i) 
- - a1 = - cbind( - a[,keras::to_categorical(as.integer(oltv_band) -1)] - #,a[,keras::to_categorical(as.integer(scr_band)-1)] - ) - a1 - }) %>% reduce(rbind) - - outcomes <- map(osi, ~{ - i = .x - a = get_chunk(acqall_dev, i, keep=c("default_next_12m")) - a[,default_next_12m] - }) %>% unlist - - auc = auc(outcomes, predict(model, some_chunks)[,2]) - - # scores - p = predict(model, diag( - length(ol) - #+length(dl) - ))[,2] - a_b = log(p/(1-p)) - - scalar = 20/log(2) - intercept = 20*log(536870912/15)/log(2) - - a = mean(a_b) - b = a_b - a - done <- length(unique(sign(diff(b)))) == 1 - - scrs = data.table(base_scr = round(a*scalar+intercept,0), scr = round(-b*scalar,0)) - scrs$var = c( - rep("oltv_band", length(ol)) - #, rep("scr_band", length(dl)) - ) - scrs$band = c(ol - , dl - ) - scrs = scrs[band != "bias"] - - scrs = scrs[,.(var, band, scr, base_scr)] - print(glue::glue("AUC: {auc}")) - print(scrs) - } - scrs -} - -pt = proc.time() -scrs = kk() -timetaken(pt) -View(scrs) - -#AUC: 0.599790504924901 -# var band scr base_scr -# 1: oltv_band (-Inf,60] 10 351 -# 2: oltv_band (60,80] 10 351 -# 3: oltv_band (80, Inf] -21 351 \ No newline at end of file diff --git a/tutorials/surf_2019_02_demo/13_logistic_regression.r b/tutorials/surf_2019_02_demo/13_logistic_regression.r deleted file mode 100644 index db822586..00000000 --- a/tutorials/surf_2019_02_demo/13_logistic_regression.r +++ /dev/null @@ -1,201 +0,0 @@ - -two_var = acqall_dev[,.(cscore_b, ocltv, mi_pct, default_next_12m)] - -all_vars = collect(acqall_dev) - -non_char = - names(all_vars)[sapply(all_vars, typeof) != c("character")] %>% - setdiff("first_default_date") - -# -# all_vars = tibble(all_vars) -# library(dplyr) -# cor( -# all_vars -# , use = "pairwise.complete.obs") - -a = sort(non_char) -alvarsn = all_vars[, ..a] - -cdt = cor(alvarsn, use = "pairwise.complete.obs") - -cdt2 = cdt[,which(colnames(cdt) == "default_next_12m")] - -cdt3 = data.table(names = names(cdt2), cor = cdt2, abs_cor = abs(cdt2)) - 
-cdt3 = cdt3[order(abs_cor, decreasing = T)] - -DT::datatable(cdt3) - - -corrplot::corrplot(cdt, method="color", diag=F) - -# Binning - -two_var1 = two_var[!is.na(ocltv),][order(ocltv),.(ocltv, default_next_12m)] - - -two_var2 = two_var1[, .(ndef = sum(default_next_12m), .N), ocltv] - -two_var3 = two_var2[,.(ocltv, ndef, N, GB_odds = cumsum(N)/cumsum(ndef))] - - -wm = which(two_var3$GB_odds == max(Filter(is.finite, two_var3$GB_odds))) -wm3 = wm -two_var3[,.(ocltv, GB_odds)] %>% - plot(main = "Cumulative Good/Bad Odds", - xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - ylim=c(0,600)) - -abline(v = two_var3[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) - ) - -## plot -two_var4 = two_var3[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var3[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm]), GB_odds[wm])]) - - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -abline(v = two_var4[wm, ocltv], lty=2) -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -## plot 2 -two_var5 = two_var4[-(1:wm),.(ocltv, N, ndef, GB_odds = cumsum(N)/cumsum(ndef))] - -two_var4[-(1:wm),.(ocltv, GB_odds = cumsum(N)/cumsum(ndef))] %>% - plot(main = "Cumulative Observed Default Rate", xlab="Originative LTV (ocltv)", - ylab="Observed Default Rate (ODR)", - xlim=c(0, 120), - ylim = c(0, 600)) - -lines(two_var3[, .(c(0,two_var3$ocltv[wm3]), GB_odds[wm3])]) - -lines(two_var4[, .(c(two_var3$ocltv[wm3],two_var4$ocltv[wm]), GB_odds[wm])]) - -wm = which(two_var4$GB_odds == max(Filter(is.finite, two_var4$GB_odds))) - -legend("bottomright", c("ODR", "max ODR location") - , lty=c(NA,2) - , pch = c(1,NA) -) - -abline(v=51, lty=2) - -var1 
= res_all[[1]] - -all_vars[, cscore_bin := cut( - all_vars[,var1$feature, with = F][[1]] , - var1$bin[[2]] %>% setdiff(c(701,713))) %>% addNA] - -bp1 = all_vars[order(cscore_bin),.(sum(default_next_12m)/.N), cscore_bin] -barplot( - bp1$V1, - names.arg = bp1[[1]], - main = "cscore_b bin Default Rate" -) - -setnames(bp1,"V1","def_rate") - -overall_df = all_vars[,sum(default_next_12m)/.N] - -bp1[,whole := overall_df] - -bp1[,woe := log((1-def_rate)/def_rate) - log((1-whole)/whole)] - -DT::datatable(bp1) - -glm(def_rate ~ I(-woe), data = bp1, family = binomial) - -all_vars[,csr_woe := bp1$woe[cscore_bin]] - -DT::datatable(all_vars[1:10,.(default_next_12m, cscore_bin, csr_woe)]) - -glm( - default_next_12m ~ csr_woe, - family = binomial, - data = all_vars -) - -View(two_var) - - - - - - -var2 = res_all[[3]] -all_vars[, ocltv_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -bp2 = all_vars[order(ocltv_bin),.(sum(default_next_12m)/.N), ocltv_bin] -barplot(bp2$V1, names.arg = bp2[[1]], main = "ocltv Orig LTV Default Rate") - - -var2 = res_all[[9]] -var2$feature -all_vars[, dti_bin := cut( - all_vars[,var2$feature, with = F][[1]], - var2$bins[[2]]) %>% addNA] - -dtrain <- xgboost::xgb.DMatrix( - label = two_var[,default_next_12m], - data = as.matrix(two_var[,cscore_b])) - -m2 <- xgboost::xgboost( - data=dtrain, - monotone_constraints = -1, # awesome!!! 
- nrounds = 1, # one tree ONLY - eta = 1, # learning rate - - - - objective = "binary:logitraw", - tree_method="exact", - max_depth = 2, - base_score = two_var[,sum(default_next_12m)/.N] -) - -xgboost::xgb.plot.tree(model=m2) - -plot(m2) - -bp2 = all_vars[order(dti_bin),.(sum(default_next_12m)/.N), dti_bin] -barplot(bp2$V1, names.arg=bp2[[1]], main="dti Debt-to-income Ratio Default Rate") - - -dt <- xgb.model.dt.tree(model = m2) - - -bin_eg = data.table( - Low = c(-Inf, 597, 657, 716), - High = c(597, 657, 716, Inf) -) - -View(bin_eg) - - -m = glm( - default_next_12m ~ - cscore_bin + - ocltv_bin + - dti_bin, - family = binomial, - data=all_vars) - -DT::datatable(broom::tidy(m), options=list(pageLength=16)) diff --git a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb b/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb deleted file mode 100644 index 93e1c58b..00000000 --- a/tutorials/surf_2019_02_demo/surf_2019_02_demo.ipynb +++ /dev/null @@ -1,68 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 2_exploratory.r\n", - "source(\"inst/fannie_mae/0_setup.r\")\n", - "\n", - "fm_with_harp = disk.frame(file.path(outpath, \"fm_with_harp\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "head(fm_with_harp)\n", - "\n", - "nrow(fm_with_harp)\n", - "\n", - "# need a two stage summary\n", - "system.time(a_wh1 <- fm_with_harp %>% \n", - " srckeep(c(\"default_12m\",\"monthly.rpt.prd\")) %>% \n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " N = n(), \n", - " n_defaults = sum(default_12m, na.rm = T)) %>% \n", - " collect(parallel = T) %>%\n", - " group_by(monthly.rpt.prd) %>% \n", - " summarise(\n", - " odr = sum(n_defaults)/sum(N)\n", - " ) %>% \n", - " rename(\n", - " Date = monthly.rpt.prd,\n", - " `Observed Default Rate%` = odr\n", - " ))\n", - " \n", - "\n", - "a_wh2 = a_wh1 %>% gather(key = type, 
value=rate, -Date)\n", - "\n", - "ggplot(a_wh2) + \n", - " geom_line(aes(x=Date, y = rate, colour = type)) +\n", - " ggtitle(\"Fannie Mae Observed Default Rate over time & HARP Conversion Rate\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "R", - "language": "R", - "name": "ir" - }, - "language_info": { - "codemirror_mode": "r", - "file_extension": ".r", - "mimetype": "text/x-r-source", - "name": "R", - "pygments_lexer": "r", - "version": "3.5.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/tutorial.r b/tutorials/tutorial.r deleted file mode 100644 index c3e459ff..00000000 --- a/tutorials/tutorial.r +++ /dev/null @@ -1,124 +0,0 @@ -library(disk.frame) -setup_disk.frame() - -a = disk.frame::disk.frame("c:/data/fannie_mae_disk_frame/fm.df/") - - -nrow(a) -ncol(a) - -head(a) - -system.time(asample <- a %>% - srckeep("loan_id") %>% - filter(!duplicated(loan_id)) %>% - sample_frac(0.01) %>% - collect) - -asample = data.frame(loan_id = asample) - -system.time( - a1 <-a %>% - cmap(~{ - inner_join(.x, asample, by = "loan_id") - }, - lazy = FALSE, - outdir = "c:/data/fannie_mae_disk_frame/fm1pct.df/", - overwrite = TRUE)) - - -a2 = collect(a1) - -data.table::fwrite(a2, "a2.csv") -arrow::write_parquet(a2, "a2.parquet") -fst::write_fst(a2, "a2.fst") - -a2$date = as.Date(a2$monthly.rpt.prd, "%m/%d/%Y") -a2$delq.status = as.numeric(a2$delq.status) - -a2[,.N,delq.status] -a2[is.na(delq.status),delq.status := 0] - -head(a2) - -setkey(a2, loan_id, date) - -system.time( - a3 <- - setDT(a2)[order(date), apply(.SD[,shift(delq.status, (1:24), type="lead")], 1 , function(x) max(x, na.rm=TRUE)), loan_id] -) - -setnames(a3, names(a3), c("loan_id", "default_n24m")) -setDT(a3)[is.infinite(default_n24m), default_n24m := 0] -a3[,.N, default_n24m] - -a2$default_n24m = a3$default_n24m >= 4 - -nrow(a2) - -a2[,.N, default_n24m] - -arrow::write_parquet(a2, "a2.parquet") -arrow::write_parquet(a2, "data.parquet") -fst::write_fst(a2, "a2.fst") 
-data.table::fwrite(a2, "a2.csv") - -table(sapply(a2, mode)) - -a2 = a2 %>% - janitor::clean_names() - -str(a2$date) -table(sapply(a2, class)) - -map2(a2, names(a2), ~{ - cx = class(.x) - list( - name = .y, - type = - ifelse( - cx == "character", - "text", - ifelse( - cx == "Date", - "datetime", - ifelse( - cx == "logical", - "bool", - ifelse( - cx == "numeric", - "numeric", - "error" - ) - ) - )), - semantic_type = - ifelse( - cx == "character", - "TextColumn", - ifelse( - cx == "Date", - "DateTimeColumn", - ifelse( - cx == "logical", - "BooleanColumn", - ifelse( - cx == "numeric", - "NumericColumn", - "error" - ) - ) - )) - ) %>% jsonlite::toJSON(auto_unbox = TRUE) -}) %>% unlist %>% - paste0(collapse = ", ") -> outjson - -res = glue::glue('{"column": [|outjson|]}', .open = "|", .close = "|") - -write(res, "schema.json") - - - - - - diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore b/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore deleted file mode 100644 index 2960c12d..00000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/.gitignore +++ /dev/null @@ -1 +0,0 @@ -useR!-2020-Tutorial-part-1.html diff --git a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd b/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd deleted file mode 100644 index 918ac490..00000000 --- a/tutorials/user_2020/useR! 2020 Tutorial part 1/useR! 2020 Tutorial part 1.Rmd +++ /dev/null @@ -1,147 +0,0 @@ ---- -title: "useR! 2020 Tutorial Part 1" -output: learnr::tutorial -runtime: shiny_prerendered ---- - -```{r setup, include=FALSE} -library(learnr) -knitr::opts_chunk$set(echo = FALSE) -``` - -## Intro - -### What is disk.frame for? 
- -### Installation - -```{r installation, exercise = TRUE} -# to install CRAN version -install.packages("disk.frame") - -# Uncomment below to install latest development version -# install.packages("remotes") -# remotes::install_github("xiaodaigh/disk.frame") -``` - -On some platforms, it helps to specify the repo if you encounter an -```r -install.packages("disk.frame", repo="https://cran.rstudio.com") -``` -### Download data? - -### Generate data? - - -## Starting disk.frame - -### Exercise - -{disk.frame} can use multiple cores and may transfer data between **workers**. - -So there are some setting up code that is recommended for all users. - -```{r setting_up, exercise=TRUE} -library(disk.frame) - -``` - -You should see a message like this from the console -``` -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - - -```{r setting_up1, exercise=TRUE} -# this will set up disk.frame with multiple workers -setup_disk.frame() -# this will allow unlimited amount of data to be passed from worker to worker -options(future.globals.maxSize = Inf) -``` - -If you forget - -```{r setting_up2, exercise=TRUE} -disk.frame::show_boilerplate() - -# or equivalently -disk.frame::show_ceremony() -``` - -TODO insert GIF of inserting code into RStudio. - -## Ingesting Data - -To use {disk.frame} as your data processing engine, you need to store your data in the disk.frame format. - -### The `disk.frame` format - -The {disk.frame} format is a folder containing multiple [`fst`](https://www.fstpackage.org/) files. And the files are named `1.fst`, `2.fst` etc, where each `fst` file is called a **chunk**. - -TODO add quiz - -```{r ingest_data_frame, exercise=TRUE} -mtcars.df = as.disk.frame(mtcars) -mtcars.df -``` - -This works but if your data fits into RAM, you probably don't need {disk.frame}. 
But perhaps you data.frame fits into RAM but you can't merge it with another dataset, because that would require additional RAM. So storing it as disk.frame before merging can help. - -## Ingesting CSV - -```{r print-limit, exercise=TRUE, exercise.eval=TRUE} -data.table::fwrite(mtcars, "mtcars.csv") -# alternatively -#write.csv(mtcars, "mtcars.csv") - -mtcars.df = csv_to_disk.frame("mtcars.csv") -mtcars.df -``` - -Notice how `mtcars.df` are stored in a temporary folder? You can move it using - -```{r} -move_to(mmtcars.df, "/some/path/permanent") -``` - -or - -```{r} -mtcars.df = csv_to_disk.frame( - "mtcars.csv", - outdir = "/some/path/permanent") - -lib1 = libname("some_folder") - -lib1$mtcars2.df = lib1$mtcars.df %>% filter(mpg < 0.5) -``` - -TODO srckeep("") - - -### Quiz - -*You can include any number of single or multiple choice questions as a quiz. Use the `question` function to define a question and the `quiz` function for grouping multiple questions together.* - -Some questions to verify that you understand the purposes of various base and recommended R packages: - -```{r quiz} -quiz( - question("Which package contains functions for installing other R packages?", - answer("base"), - answer("tools"), - answer("utils", correct = TRUE), - answer("codetools") - ), - question("Which of the R packages listed below are used to create plots?", - answer("lattice", correct = TRUE), - answer("tools"), - answer("stats"), - answer("grid", correct = TRUE) - ) -) -``` - diff --git a/tutorials/vs-vaex/ok.py b/tutorials/vs-vaex/ok.py deleted file mode 100644 index 64effa8b..00000000 --- a/tutorials/vs-vaex/ok.py +++ /dev/null @@ -1,5 +0,0 @@ -import vaex - -df = vaex.example() -df # Since this is the last statement in a cell, it will print the DataFrame in a nice HTML format. 
- diff --git a/utils/build_utils.R b/utils/build_utils.R index 33103cf0..4e955594 100644 --- a/utils/build_utils.R +++ b/utils/build_utils.R @@ -1,21 +1,3 @@ -# df_bookdown_build <- function() { -# rmds = list.files("vignettes", pattern = "*.Rmd") -# sapply(rmds, function(file) { -# fs::file_copy( -# file.path("book", file), -# file.path("vignettes", file), -# overwrite = TRUE -# ) -# }) -# if(fs::dir_exists("book/_bookdown_files")) { -# fs::dir_delete("book/_bookdown_files") -# } -# while(fs::dir_exists("book/_bookdown_files")) { -# Sys.sleep(1) -# } -# rmarkdown::render_site("book", encoding = 'UTF-8') -# } - df_build_site <- function() { df_build_readme() df_setup_vignette() @@ -205,6 +187,12 @@ df_release <- function() { devtools::release() } +df_ultimate <- function() { + df_check() + df_release() + df_build_site() +} + if(F) { df_check() }