In [1]:
# Libraries
library(tidyverse)
library(lubridate)

── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date



In [2]:
# Load data
if (!exists("loans")) loans <- readRDS("datasets/lending_club_reformatted_paid.rds")
if (!exists("loansIRR"))
  loansIRR <- read.csv("datasets/LoanIRR.csv") %>% as_tibble()

if (!exists("loansCreditMargin"))
  loansCreditMargin <- read.csv("datasets/CreditMargins.csv") %>% as_tibble() %>%
    # Remove monthDefault since already in IRR dataframe
    select(-monthDefault)

nSamples <- nrow(loans)


loansWorkingSet <- loans %>%
  # add IRR and credit margins calculations
  left_join(loansIRR, by = "loanID") %>%
  left_join(loansCreditMargin, by = "loanID") %>%

  # add a % principal loss variable
  mutate(principal_loss_pct = (funded_amnt - total_rec_prncp) / funded_amnt)

In [3]:
# Mostly generated from a LibreOffice spreadsheet for convenience purposes, then reformatted by RStudio.
# This is a copy from 01-startup/Rmd

LC_variable <-
  tibble(
    variable_name = 'loanID' ,
    description = 'NOTE THIS IS NOT AN ORIGINAL VARIABLE. IT WAS ADDED FOR THE PURPOSE OF TRACKING LOANS INDIVIDUALLY AS AND WHEN NEEDED.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'loan_amnt' ,
    description = 'The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'funded_amnt' ,
    description = 'The total amount committed to that loan at that point in time.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'funded_amnt_inv' ,
    description = 'The total amount committed by investors for that loan at that point in time.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'term' ,
    description = 'The number of payments on the loan. Values are in months and can be either 36 or 60.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'int_rate' ,
    description = 'Interest Rate on the loan',
    inModel = TRUE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'installment' ,
    description = 'The monthly payment owed by the borrower if the loan originates.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'grade' ,
    description = 'LC assigned loan grade',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sub_grade' ,
    description = 'LC assigned loan subgrade',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'emp_title' ,
    description = 'The job title supplied by the Borrower when applying for the loan.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'emp_length' ,
    description = 'Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years. ',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'home_ownership' ,
    description = 'The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER, NONE',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'annual_inc' ,
    description = 'The self-reported annual income provided by the borrower during registration. NOT USED AS A VARIABLE SINCE JOINT INCOME ALREADY INCLUDES IT.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'verification_status' ,
    description = 'Indicates if income was verified by LC, not verified, or if the income source was verified',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'issue_d' ,
    description = 'The month which the loan was funded',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'loan_status' ,
    description = 'Current status of the loan',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'pymnt_plan' ,
    description = 'Indicates if a payment plan has been put in place for the loan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'url' ,
    description = 'URL for the LC page with listing data.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'desc' ,
    description = 'Loan description provided by the borrower',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'purpose' ,
    description = 'A category provided by the borrower for the loan request. ',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'title' ,
    description = 'The loan title provided by the borrower',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'zip_code' ,
    description = 'The first 3 numbers of the zip code provided by the borrower in the loan application.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'addr_state' ,
    description = 'The state provided by the borrower in the loan application',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'dti' ,
    description = 'A ratio calculated using the borrower s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower s self-reported monthly income. NOT USED AS A VARIABLE. ONLY USE JOINT DTI.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'delinq_2yrs' ,
    description = 'The number of 30+ days past-due incidences of delinquency in the borrower s credit file for the past 2 years',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'earliest_cr_line' ,
    description = 'The month the borrower s earliest reported credit line was opened',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'inq_last_6mths' ,
    description = 'The number of inquiries in past 6 months (excluding auto and mortgage inquiries)',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_last_delinq' ,
    description = 'The number of months since the borrower s last delinquency.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_last_record' ,
    description = 'The number of months since the last public record.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'open_acc' ,
    description = 'The number of open credit lines in the borrower s credit file.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'pub_rec' ,
    description = 'Number of derogatory public records',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'revol_bal' ,
    description = 'Total credit revolving balance',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'revol_util' ,
    description = 'Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_acc' ,
    description = 'The total number of credit lines currently in the borrower s credit file',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'initial_list_status' ,
    description = 'The initial listing status of the loan. Possible values are – W, F',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'out_prncp' ,
    description = 'Remaining outstanding principal for total amount funded. NOTE ONCE A LOAN IS REPAID OR CHARGED OFF, THIS AMOUNT BECOMES 0. ',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'out_prncp_inv' ,
    description = 'Remaining outstanding principal for portion of total amount funded by investors. NOTE ONCE A LOAN IS REPAID OR CHARGED OFF, THIS AMOUNT BECOMES 0. ',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_pymnt' ,
    description = 'Payments received to date for total amount funded',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_pymnt_inv' ,
    description = 'Payments received to date for portion of total amount funded by investors',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_rec_prncp' ,
    description = 'Principal received to date. NOTE THIS AMOUNT WILL SHOW WHETHER A BORROWER DID NOT REPAY IN FULL',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_rec_int' ,
    description = 'Interest received to date',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_rec_late_fee' ,
    description = 'Late fees received to date',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'recoveries' ,
    description = 'Post charge off gross recovery. NOTE IF A LOAN IS REPAID, THIS AMOUNT IS 0. ',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'collection_recovery_fee' ,
    description = 'Post charge off collection fee',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'last_pymnt_d' ,
    description = 'Last month payment was received',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'last_pymnt_amnt' ,
    description = 'Last total payment amount received',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'next_pymnt_d' ,
    description = 'Next scheduled payment date',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'last_credit_pull_d' ,
    description = 'The most recent month LC pulled credit for this loan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'collections_12_mths_ex_med' ,
    description = 'Number of collections in 12 months excluding medical collections',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_last_major_derog' ,
    description = 'Months since most recent 90-day or worse rating',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'policy_code' ,
    description = 'Publicly available policy_code=1 / New products not publicly available policy_code=2',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'application_type' ,
    description = 'Indicates whether the loan is an individual application or a joint application with two coborrowers',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'annual_inc_joint' ,
    description = 'The combined self-reported annual income provided by the coborrowers during registration',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'dti_joint' ,
    description = 'A ratio calculated using the coborrowers total monthly payments on the total debt obligations, excluding mortgages and the requested LC loan, divided by the coborrowers combined self-reported monthly income',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'verification_status_joint' ,
    description = 'Indicates if income was verified by LC, not verified, or if the income source was verified',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'acc_now_delinq' ,
    description = 'The number of accounts on which the borrower is now delinquent.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'tot_coll_amt' ,
    description = 'Total collection amounts ever owed',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'tot_cur_bal' ,
    description = 'Total current balance of all accounts',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'open_acc_6m' ,
    description = 'Number of open trades in last 6 months',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'open_act_il' ,
    description = 'Number of currently active installment trades',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'open_il_12m' ,
    description = 'Number of installment accounts opened in past 12 months',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'open_il_24m' ,
    description = 'Number of installment accounts opened in past 24 months',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'mths_since_rcnt_il' ,
    description = 'Months since most recent instalment accounts opened',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_bal_il' ,
    description = 'Total current balance of all installment accounts',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'il_util' ,
    description = 'Ratio of total current balance to high credit/credit limit on all install acct',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'open_rv_12m' ,
    description = 'Number of revolving trades opened in past 12 months',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'open_rv_24m' ,
    description = 'Number of revolving trades opened in past 24 months',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'max_bal_bc' ,
    description = 'Maximum current balance owed on all revolving accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'all_util' ,
    description = 'Balance to credit limit on all trades',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'total_rev_hi_lim' ,
    description = 'Total revolving high credit/credit limit',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'inq_fi' ,
    description = 'Number of personal finance inquiries',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_cu_tl' ,
    description = 'Number of finance trades',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'inq_last_12m' ,
    description = 'Number of credit inquiries in past 12 months',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'acc_open_past_24mths' ,
    description = 'Number of trades opened in past 24 months.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'avg_cur_bal' ,
    description = 'Average current balance of all accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'bc_open_to_buy' ,
    description = 'Total open to buy on revolving bankcards.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'bc_util' ,
    description = 'Ratio of total current balance to high credit/credit limit for all bankcard accounts.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'chargeoff_within_12_mths' ,
    description = 'Number of charge-offs within 12 months',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'delinq_amnt' ,
    description = 'The past-due amount owed for the accounts on which the borrower is now delinquent.',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'mo_sin_old_il_acct' ,
    description = 'Months since oldest bank instalment account opened',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mo_sin_old_rev_tl_op' ,
    description = 'Months since oldest revolving account opened',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mo_sin_rcnt_rev_tl_op' ,
    description = 'Months since most recent revolving account opened',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mo_sin_rcnt_tl' ,
    description = 'Months since most recent account opened',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mort_acc' ,
    description = 'Number of mortgage accounts.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_recent_bc' ,
    description = 'Months since most recent bankcard account opened.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_recent_bc_dlq' ,
    description = 'Months since most recent bankcard delinquency',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_recent_inq' ,
    description = 'Months since most recent inquiry.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'mths_since_recent_revol_delinq' ,
    description = 'Months since most recent revolving delinquency.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_accts_ever_120_pd' ,
    description = 'Number of accounts ever 120 or more days past due',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_actv_bc_tl' ,
    description = 'Number of currently active bankcard accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_actv_rev_tl' ,
    description = 'Number of currently active revolving trades',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_bc_sats' ,
    description = 'Number of satisfactory bankcard accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_bc_tl' ,
    description = 'Number of bankcard accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_il_tl' ,
    description = 'Number of installment accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_op_rev_tl' ,
    description = 'Number of open revolving accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_rev_accts' ,
    description = 'Number of revolving accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_rev_tl_bal_gt_0' ,
    description = 'Number of revolving trades with balance >0',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_sats' ,
    description = 'Number of satisfactory accounts',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_tl_120dpd_2m' ,
    description = 'Number of accounts currently 120 days past due (updated in past 2 months)',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_tl_30dpd' ,
    description = 'Number of accounts currently 30 days past due (updated in past 2 months)',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_tl_90g_dpd_24m' ,
    description = 'Number of accounts 90 or more days past due in last 24 months',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'num_tl_op_past_12m' ,
    description = 'Number of accounts opened in past 12 months',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'pct_tl_nvr_dlq' ,
    description = 'Percent of trades never delinquent',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'percent_bc_gt_75' ,
    description = 'Percentage of all bankcard accounts > 75% of limit.',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'pub_rec_bankruptcies' ,
    description = 'Number of public record bankruptcies',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'tax_liens' ,
    description = 'Number of tax liens',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'tot_hi_cred_lim' ,
    description = 'Total high credit/credit limit',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_bal_ex_mort' ,
    description = 'Total credit balance excluding mortgage',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_bc_limit' ,
    description = 'Total bankcard high credit/credit limit',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'total_il_high_credit_limit' ,
    description = 'Total installment high credit/credit limit',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'revol_bal_joint' ,
    description = 'Total credit revolving balance',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_earliest_cr_line' ,
    description = 'Earliest credit line at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_inq_last_6mths' ,
    description = 'Credit inquiries in the last 6 months at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_mort_acc' ,
    description = 'Number of mortgage accounts at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_open_acc' ,
    description = 'Number of open trades at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_revol_util' ,
    description = 'Ratio of total current balance to high credit/credit limit for all revolving accounts. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_open_act_il' ,
    description = 'Number of currently active installment trades at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_num_rev_accts' ,
    description = 'Number of revolving accounts at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_chargeoff_within_12_mths' ,
    description = 'Number of charge-offs within last 12 months at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_collections_12_mths_ex_med' ,
    description = 'Number of collections within last 12 months excluding medical collections at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'sec_app_mths_since_last_major_derog' ,
    description = 'Months since most recent 90-day or worse rating at time of application for the secondary applicant. VARIABLE NOT USED. WE RELY ON THE MAIN BORROWER IN THE FIRST INSTANCE.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'hardship_flag' ,
    description = 'Flags whether or not the borrower is on a hardship plan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_type' ,
    description = 'Describes the hardship plan offering',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'hardship_reason' ,
    description = 'Describes the reason the hardship plan was offered',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'hardship_status' ,
    description = 'Describes if the hardship plan is active, pending, cancelled, completed, or broken',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'deferral_term' ,
    description = 'Amount of months that the borrower is expected to pay less than the contractual monthly payment amount due to a hardship plan',
    inModel = FALSE,
    inPrediction = TRUE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_amount' ,
    description = 'The interest payment that the borrower has committed to make each month while they are on a hardship plan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_start_date' ,
    description = 'The start date of the hardship plan period',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_end_date' ,
    description = 'The end date of the hardship plan period',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'payment_plan_start_date' ,
    description = 'The day the first hardship plan payment is due. For example, if a borrower has a hardship plan period of 3 months, the start date is the start of the three-month period in which the borrower is allowed to make interest-only payments.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_length' ,
    description = 'The number of months the borrower will make smaller payments than normally obligated due to a hardship plan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_dpd' ,
    description = 'Account days past due as of the hardship plan start date',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_loan_status' ,
    description = 'Loan Status as of the hardship plan start date',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'orig_projected_additional_accrued_interest' ,
    description = 'The original projected additional interest amount that will accrue for the given hardship payment plan as of the Hardship Start Date. This field will be null if the borrower has broken their hardship payment plan.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_payoff_balance_amount' ,
    description = 'The payoff balance amount as of the hardship plan start date',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'hardship_last_payment_amount' ,
    description = 'The last payment amount as of the hardship plan start date',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = TRUE
  ) %>%
  add_row(
    variable_name = 'disbursement_method' ,
    description = 'The method by which the borrower receives their loan. Possible values are: CASH, DIRECT_PAY',
    inModel = TRUE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'debt_settlement_flag' ,
    description = 'Flags whether or not the borrower, who has charged-off, is working with a debt-settlement company.',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'debt_settlement_flag_date' ,
    description = 'The most recent date that the Debt_Settlement_Flag has been set  ',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'settlement_status' ,
    description = 'The status of the borrower’s settlement plan. Possible values are: COMPLETE, ACTIVE, BROKEN, CANCELLED, DENIED, DRAFT',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'settlement_date' ,
    description = 'The date that the borrower agrees to the settlement plan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'settlement_amount' ,
    description = 'The loan amount that the borrower has agreed to settle for',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'settlement_percentage' ,
    description = 'The settlement amount as a percentage of the payoff balance amount on the loan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  ) %>%
  add_row(
    variable_name = 'settlement_term' ,
    description = 'The number of months that the borrower will be on the settlement plan',
    inModel = FALSE,
    inPrediction = FALSE,
    inPricing = FALSE
  )


In [4]:
# Select the variables checked in 01-startup.Rmd
varList <- c(LC_variable[LC_variable$inModel == TRUE, "variable_name"])$variable_name
varList <- c(varList, "grade_num", "sub_grade_num", "principal_loss_pct", "creditMargin", "monthDefault")

# Make sure that some variables are NOT in included in the training set
varRemove <- c("")

In [5]:
#################################################################################################
##
## Prepare a dataset with ONLY the predictors
##
loansPredictors <-
  loansWorkingSet %>%

  # Keep the chosen predictors
  # Use one_of() to avoid errors if column does not exist
  select(one_of(varList)) %>%
  select(-one_of(varRemove)) %>%

  # [TODO] FOR THE MOMENT UNTIL MACRODATA IS FIXED
  select(-one_of("addr_state")) %>%

  ##
  ## Dates to numeric, in 'decimal' years since 2000
  ##
  mutate_at(c("issue_d", "earliest_cr_line"), function(d) {
    return(year(d) - 2000 + (month(d) - 1) / 12)
  }) %>%

  ## Add polynomials of the dates to model the time-trend shape
  mutate(issue_d2 = issue_d ^ 2,
         issue_d3 = issue_d ^ 3,
         earliest_cr_line2 = earliest_cr_line ^ 2,
         earliest_cr_line3 = earliest_cr_line ^ 3) %>%

  # Fill missing values
  mutate_all(~replace(., is.na(.), 0))

“invalid factor level, NA generated”

In [6]:
#################################################################################################
##
## One-hot encoding of factors (in case model do not mix continuous and cat. variables)
##
modelX <-
  loansPredictors %>%

  # Binary categories into 0/1
  mutate(disbursement_method = if_else(disbursement_method == "Cash", 1, 0)) %>%
  rename(cashDisbursement = disbursement_method) %>%

  mutate(application_type = if_else(application_type == "Individual", 0, 1)) %>%
  rename(applicationJoint = application_type) %>%

  ## Widen the categorical datas
  # Verification
  mutate(verifiedSource      = if_else(verification_status       == "Source Verified", 1, 0),
         verified            = if_else(verification_status       == "Not Verified", 0, 1),
         verifiedJointSource = if_else(verification_status_joint == "Source Verified", 1, 0),
         verifiedJoint       = if_else(verification_status_joint %in% c("", "Not Verified"), 0, 1)) %>%
  select(-verification_status, -verification_status_joint) %>%

  # Home ownership
  mutate(homeANY = if_else(home_ownership == "ANY", 1, 0),
         homeMORTGAGE = if_else(home_ownership == "MORTGAGE", 1, 0),

         # Delete beacuse useless
         homeNONE = if_else(home_ownership == "NONE", 1, 0),
         homeOTHER = if_else(home_ownership == "OTHER", 1, 0),
         homeOWN = if_else(home_ownership == "OWN", 1, 0),
         homeRENT = if_else(home_ownership == "RENT", 1, 0)) %>%
  select(-home_ownership) %>%

  # Purpose
  mutate(purpCAR = if_else(purpose == "car", 1, 0),
         purpCREDITCARD = if_else(purpose == "credit_card", 1, 0),
         purpCDEBTCONSO = if_else(purpose == "debt_consolidation", 1, 0),
         purpEDUCATION = if_else(purpose == "educational", 1, 0),
         purpHOMEIMPROV = if_else(purpose == "home_improvement", 1, 0),
         purpHOUSE = if_else(purpose == "house", 1, 0),
         purpPURCHASE = if_else(purpose == "major_purchase", 1, 0),
         purpMEDICAL = if_else(purpose == "medical", 1, 0),
         purpMOVING = if_else(purpose == "moving", 1, 0),
         purpOTHER = if_else(purpose == "other", 1, 0),
         purpRENEWABLE = if_else(purpose == "renewable_energy", 1, 0),
         purpSMALLBUS = if_else(purpose == "small_business", 1, 0),
         purpVACATION = if_else(purpose == "vacation", 1, 0),
         purpWEDDING = if_else(purpose == "wedding", 1, 0)) %>%
  select(-purpose)

In [7]:
# Create training / test sets 80%/20%
proportionTraining <- 0.8
set.seed(42)
sampleTraining  <- sample(1:nSamples, floor(nSamples * proportionTraining), replace = FALSE)
loansTraining <- loansPredictors %>% slice( sampleTraining)
loansTest <-     loansPredictors %>% slice(-sampleTraining)
modelTraining <- modelX %>% slice( sampleTraining)
modelTest <-     modelX %>% slice(-sampleTraining)


# Subsets of the training set
set.seed(42)
nSamplesTraining <- nrow(loansTraining)
sample005 <- sample(1:nSamplesTraining,  floor(nSamplesTraining * 0.005), replace = FALSE)
sample01  <- sample(1:nSamplesTraining,  floor(nSamplesTraining * 0.01),  replace = FALSE)
sample05  <- sample(1:nSamplesTraining,  floor(nSamplesTraining * 0.05),  replace = FALSE)
sample10  <- sample(1:nSamplesTraining,  floor(nSamplesTraining * 0.10),  replace = FALSE)
sample20  <- sample(1:nSamplesTraining,  floor(nSamplesTraining * 0.20),  replace = FALSE)

loans005 <- loansTraining %>% slice(sample005)
loans01  <- loansTraining %>% slice(sample01)
loans05  <- loansTraining %>% slice(sample05)
loans10  <- loansTraining %>% slice(sample10)
loans20  <- loansTraining %>% slice(sample20)

model005 <- modelX %>% slice(sampleTraining) %>% slice(sample005)
model01  <- modelX %>% slice(sampleTraining) %>% slice(sample01)
model05  <- modelX %>% slice(sampleTraining) %>% slice(sample05)
model10  <- modelX %>% slice(sampleTraining) %>% slice(sample10)
model20  <- modelX %>% slice(sampleTraining) %>% slice(sample20)

G005 <- modelX %>% select(sub_grade_num) %>% slice(sample005)
G01  <- modelX %>% select(sub_grade_num) %>% slice(sample01)
G05  <- modelX %>% select(sub_grade_num) %>% slice(sample05)
G10  <- modelX %>% select(sub_grade_num) %>% slice(sample10)
G20  <- modelX %>% select(sub_grade_num) %>% slice(sample20)

loss005 <- loansTraining %>% select(principal_loss_pct) %>% slice(sample005)
loss01  <- loansTraining %>% select(principal_loss_pct) %>% slice(sample01)
loss05  <- loansTraining %>% select(principal_loss_pct) %>% slice(sample05)
loss10  <- loansTraining %>% select(principal_loss_pct) %>% slice(sample10)
loss20  <- loansTraining %>% select(principal_loss_pct) %>% slice(sample20)

CM005  <- loansTraining %>% select(creditMargin) %>% slice(sample005)
CM01   <- loansTraining %>% select(creditMargin) %>% slice(sample01)
CM05   <- loansTraining %>% select(creditMargin) %>% slice(sample05)
CM10   <- loansTraining %>% select(creditMargin) %>% slice(sample10)
CM20   <- loansTraining %>% select(creditMargin) %>% slice(sample20)

In [11]:
library(cluster)
library(factoextra)
library(caret)
library(xgboost)
library(doParallel)

# Attach again, otherwise slice is masked
library(dplyr)

Loading required package: foreach

Attaching package: ‘foreach’

The following objects are masked from ‘package:purrr’:

    accumulate, when

Loading required package: iterators
Loading required package: parallel


# RANDOM FOREST

Use Rborist since it accepts factors


In [None]:
trainKMEANS <- train(x = model01, y = Y01$sub_grade_num,
                     method = "knn",
                     tuneGrid = data.frame(k = seq(3, 9, 2)))

# Random Search
# set.seed(seed)
# rf_random <- train(Class~., data = dataset, method = "rf", metric = metric, trControl = control)
# print(rf_random)
# plot(rf_random)


# 17000 sec.
# Best result = 32 predictors
{
  tictoc::tic()
  cl <- makePSOCKcluster(4)
  registerDoParallel(cl)

  loansTmp <- loans005 %>% select(-creditMargin, principal_loss_pct)
  trainRF <- train(x = loansTmp, y = CM005$creditMargin,
                   method = "Rborist",
                   nSamp = 2500,
                   trControl = trainControl(method = "cv"))

  stopCluster(cl)
  tictoc::toc()

  print(trainRF)
  varImp(trainRF)
}
#################################################################################################
# Random Forest
#
# IF RESPONSE IS THE SUB-GRADE
#
# 13063 samples
# 63 predictor
#
# No pre-processing
# Resampling: Cross-Validated (10 fold, repeated 1 times)
# Summary of sample sizes: 11756, 11757, 11757, 11757, 11758, 11757, ...
# Resampling results across tuning parameters:
#
#   predFixed  RMSE       Rsquared   MAE
# 2         1.0217915  0.4787036  0.8070386
# 32         0.9217696  0.4984839  0.7231087
# 63         0.9235622  0.4944814  0.7241667
#
# Tuning parameter 'minNode' was held constant at a value of 3
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were predFixed = 32 and minNode = 3.
#

#################################################################################################
# Random Forest
#
# IF RESPONSE IS THE PRINCIPAL LOSS (in %)
#
# 13063 samples
# 65 predictor
#
# No pre-processing
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 11756, 11757, 11757, 11758, 11756, 11756, ...
# Resampling results across tuning parameters:
#
#   predFixed  RMSE       Rsquared    MAE
# 2         0.2824622  0.09468031  0.2166095
# 33         0.2812666  0.09767208  0.2120904
# 65         0.2820000  0.09485043  0.2124572
#
# Tuning parameter 'minNode' was held constant at a value of 3
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were predFixed = 33 and minNode = 3.
#
#
#################################################################################################
# Random Forest
#
# IF RESPONSE IS THE CREDIT MARGIN (in %)
# BUT: principal loss and credit margins were in the predictors
#
#
# 10450 samples
# 73 predictor
#
# No pre-processing
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 9404, 9405, 9405, 9405, 9405, 9405, ...
# Resampling results across tuning parameters:
#
#   predFixed  RMSE        Rsquared   MAE
# 2         0.38465362  0.7490979  0.189267621
# 37         0.04906856  0.9926129  0.013818872
# 73         0.01188224  0.9995688  0.001378399
#
# Tuning parameter 'minNode' was held constant at a value of 3
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were predFixed = 73 and minNode = 3.
#
# 6642 sec.
#



# .
# Best result = 32 predictors



#################################################################################################
#
# 8008.441 sec elapsed
#
# Random Forest
#
# ON
#
# loansTmp <- loans005 %>% select(-creditMargin, principal_loss_pct)
# trainRF <- train(x = loansTmp, y = CM005$creditMargin,
#                  method = "Rborist",
#                  nSamp = 2500,
#                  trControl = trainControl(method = "cv"))
#
#
# 5225 samples
# 72 predictor
#
# No pre-processing
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 4704, 4702, 4704, 4702, 4701, 4703, ...
# Resampling results across tuning parameters:
#
#   predFixed  RMSE       Rsquared   MAE
# 2         0.4449010  0.4494987  0.21419547
# 37         0.1864721  0.8824442  0.09965741
# 72         0.1688334  0.9004621  0.09087179
#
# Tuning parameter 'minNode' was held constant at a value of 3
# RMSE was used to select the optimal model using the smallest value.
# The final values used for the model were predFixed = 72 and minNode = 3.
# Rborist variable importance
#
# only 20 most important variables shown (out of 72)
#
# Overall
# percent_bc_gt_75           100.0000
# term                         4.0345
# total_il_high_credit_limit   0.6672
# mo_sin_old_il_acct           0.5140
# pub_rec_bankruptcies         0.5113
# tax_liens                    0.4631
# num_tl_120dpd_2m             0.3957
# int_rate                     0.3929
# loan_amnt                    0.3790
# sub_grade_num                0.3747
# principal_loss_pct           0.3612
# open_rv_12m                  0.3385
# num_bc_tl                    0.3336
# avg_cur_bal                  0.2823
# acc_now_delinq               0.2798
# total_acc                    0.2701
# loanID                       0.2673
# inq_last_6mths               0.2381
# verification_status_joint    0.2249
# tot_hi_cred_lim              0.2033
# Warning message:
#   Setting row names on a tibble is deprecated.



# 17000 sec.
# Best result = 32 predictors

# Extreme Gradient Boosting

In [None]:
##############################################################
## 
## EXTREME GRADIENT BOOSTING
## 
{
  tictoc::tic()
  cl <- makePSOCKcluster(4)
  registerDoParallel(cl)

  loansTmp <- model005 %>% select(-creditMargin, principal_loss_pct, monthDefault)
  trainXGB <- train(x = loansTmp, y = CM005$creditMargin,
                    method="xgbTree",
                    trControl = trainControl(method = "cv"))

  stopCluster(cl)
  tictoc::toc()

  #confusionMatrix(predict(trainXGB, val_test_data[, -1]), val_test_data$outcome)

  print(trainXGB)
  varImp(trainXGB)
}

In [None]:
{
  require(caret)
  require(Rborist)
  require(doParallel)

  cl <- makePSOCKcluster(4)
  registerDoParallel(cl)

  tictoc::tic()

  trainRF <- train(x = loans01, y = Y01$sub_grade_num,
                   method = "Rborist",
                   predFixed = 32,
                   nSamp = 2500)

  tictoc::toc()
  stopCluster(cl)

  print(trainRF)
}



ggplot(trainRF)

cc <- cor(modelX)
ggcorrplot::ggcorrplot(cc, hc.order = TRUE, type = "upper", outline.color = "white")



tmp01 <-
  loans %>%
  slice(sample01) %>%
  select(funded_amnt, int_rate, installment, term, total_pymnt, total_rec_prncp)