In [20]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
from scipy import stats

In [2]:
accepted_data = dd.read_csv('../data/accepted_2007_to_2018Q4.csv', dtype={'sec_app_earliest_cr_line':'object',
                                                                      'id':'object',
                                                                      'desc':'object'}, low_memory=False)
accepted_data.issue_d = accepted_data.issue_d.map_partitions(pd.to_datetime, meta=('issue_d', 'datetime64[ns]'))

In [3]:
# Keep only years between 2016 and 2018 inclusive
accepted_data = accepted_data[accepted_data.issue_d.dt.year.between(2016,2018, inclusive=True)]

In [4]:
# Remove Grade F and G
accepted_data = accepted_data[accepted_data.grade != 'F']
accepted_data = accepted_data[accepted_data.grade != 'G']

In [23]:
### IMPUTATION STRATEGIES  ###


# open_acc_6m
# Notes: 61 missing
# Strategy: impute with mean
accepted_data.open_acc_6m = accepted_data.open_acc_6m.fillna(np.mean(accepted_data.open_acc_6m))

# open_act_il
# Notes: 60 missing
# Strategy: Impute with mean
accepted_data.open_act_il = accepted_data.open_act_il.fillna(np.mean(accepted_data.open_act_il))

# open_il_12m
# Notes: 60 missing
# Strategy: Impute with mean
accepted_data.open_il_12m = accepted_data.open_il_12m.fillna(np.mean(accepted_data.open_il_12m))

# open_il_24m
# Notes: 60 missing
# Strategy: Impute with mean
accepted_data.open_il_24m = accepted_data.open_il_24m.fillna(np.mean(accepted_data.open_il_24m))

# open_rv_12m
# Notes: 60 missing
# Strategy: Impute with mean
accepted_data.open_rv_12m = accepted_data.open_rv_12m.fillna(np.mean(accepted_data.open_rv_12m))

# open_rv_24m
# Notes: 60 missing 
# Strategy: Impute with mean
accepted_data.open_rv_24m = accepted_data.open_rv_24m.fillna(np.mean(accepted_data.open_rv_24m))

# pct_tl_nvr_dlq
# Notes: 2 missing
# Strategy: Impute with mean
accepted_data.pct_tl_nvr_dlq = accepted_data.pct_tl_nvr_dlq.fillna(np.mean(accepted_data.pct_tl_nvr_dlq))

# percent_bc_gt_75
# Notes: 16540 missing (1% of the number of observations)
# Strategy: Impute with mean???
accepted_data.percent_bc_gt_75 = 

# pub_rec
# Notes: 0 missing
# Strategy: we can impute my mode if needed
accepted_data.pub_rec = accepted_data.pub_rec.fillna(stats.mode(accepted_data.pub_rec).mode[0])

# pub_rec_bankruptcies
# Notes: 0 missing
# Strategy: we could impute with mode if needed
accepted_data.pub_rec_bankruptcies = accepted_data.pub_rec_bankruptcies.fillna(stats.mode(accepted_data.pub_rec_bankruptcies).mode[0])

# purpose
# Notes: 0 missing
# Strategy: if missing, fill with 'No Info'
accepted_data.purpose = accepted_data.purpose.fillna('No Info')

# revol_bal
# Notes: 0 missing
# Strategy: Impute with meanif we need to
accepted_data.revol_bal = accepted_data.revol_bal.fillna(np.mean(accepted_data.revol_bal))

# revol_bal_joint
# Notes: 1242011 missing (92% of observations)
# Strategy: ???
accepted_data.revol_bal_joint = 

# revol_util
# Notes: 1263 missing
# Strategy: Impute with mean
accepted_data.revol_util = accepted_data.revol_util.fillna(np.mean(accepted_data.revol_util))


In [25]:
accepted_data.pub_rec = accepted_data.pub_rec.fillna(stats.mode(accepted_data.pub_rec).mode[0])

In [17]:
accepted_data[accepted_data.percent_bc_gt_75.isnull()].compute()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
55601,130923856,,15350.0,15350.0,15350.0,36 months,6.07,467.47,A,A2,Compliance Manager,6 years,MORTGAGE,148000.0,Source Verified,2018-03-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,282xx,NC,23.62,0.0,Sep-2003,760.0,764.0,0.0,,83.0,19.0,1.0,46635.0,26.6,33.0,w,10951.81,10951.81,5134.410000,5134.41,4398.19,736.22,0.00,0.0,0.0,Feb-2019,467.47,Apr-2019,Mar-2019,734.0,730.0,0.0,,1.0,Individual,,,,0.0,0.0,426610.0,0.0,12.0,0.0,3.0,21.0,129768.0,79.0,0.0,3.0,0.0,68.0,78400.0,0.0,6.0,0.0,7.0,22453.0,,,0.0,0.0,140.0,174.0,13.0,13.0,2.0,,,18.0,,0.0,0.0,6.0,0.0,2.0,23.0,6.0,8.0,6.0,19.0,0.0,0.0,0.0,0.0,100.0,,1.0,0.0,512289.0,176403.0,0.0,164889.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
55602,130964787,,4600.0,4600.0,4600.0,36 months,9.92,148.26,B,B2,Technology Product Specilist,10+ years,RENT,70000.0,Not Verified,2018-03-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,801xx,CO,7.63,0.0,Aug-2004,675.0,679.0,0.0,,92.0,5.0,1.0,9504.0,29.0,36.0,w,3336.05,3336.05,1625.790000,1625.79,1263.95,361.84,0.00,0.0,0.0,Feb-2019,148.26,Apr-2019,Mar-2019,719.0,715.0,0.0,,1.0,Individual,,,,0.0,0.0,9504.0,1.0,0.0,0.0,0.0,38.0,0.0,,1.0,4.0,0.0,29.0,32799.0,0.0,5.0,1.0,5.0,1901.0,,,0.0,0.0,163.0,157.0,5.0,5.0,0.0,,,8.0,,0.0,0.0,3.0,0.0,8.0,12.0,4.0,23.0,3.0,5.0,0.0,0.0,0.0,1.0,100.0,,1.0,0.0,32912.0,9504.0,0.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
55652,130701401,,10000.0,10000.0,10000.0,36 months,5.31,301.11,A,A1,,,RENT,61056.0,Verified,2018-03-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,credit_card,Credit card refinancing,117xx,NY,7.72,0.0,May-2011,755.0,759.0,0.0,,,8.0,0.0,8444.0,28.1,12.0,w,6480.58,6480.58,3908.530000,3908.53,3519.42,389.11,0.00,0.0,0.0,Feb-2019,301.11,Apr-2019,Mar-2019,729.0,725.0,0.0,,1.0,Individual,,,,0.0,0.0,14173.0,0.0,1.0,0.0,0.0,44.0,5729.0,42.0,1.0,6.0,0.0,33.0,30100.0,0.0,2.0,1.0,6.0,2025.0,,,0.0,0.0,59.0,82.0,9.0,9.0,0.0,82.0,,9.0,,0.0,0.0,1.0,1.0,1.0,2.0,7.0,10.0,1.0,8.0,0.0,0.0,0.0,1.0,100.0,,0.0,0.0,43599.0,14173.0,0.0,13499.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,
55657,129902943,,3000.0,3000.0,3000.0,36 months,11.98,99.62,B,B5,Food Service,10+ years,OWN,52000.0,Source Verified,2018-03-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,042xx,ME,0.00,0.0,Jun-1992,750.0,754.0,0.0,,,1.0,0.0,0.0,,13.0,w,2194.21,2194.21,1091.830000,1091.83,805.79,286.04,0.00,0.0,0.0,Feb-2019,99.62,Apr-2019,Mar-2019,759.0,755.0,0.0,,1.0,Individual,,,,0.0,0.0,74761.0,0.0,0.0,0.0,2.0,14.0,0.0,,0.0,0.0,0.0,,0.0,1.0,2.0,0.0,3.0,74761.0,,,0.0,0.0,121.0,309.0,81.0,14.0,2.0,,,14.0,,0.0,0.0,0.0,0.0,2.0,8.0,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,100.0,,0.0,0.0,80000.0,0.0,0.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
55738,130952667,,6000.0,6000.0,6000.0,36 months,7.34,186.20,A,A4,Technical,10+ years,OWN,180000.0,Not Verified,2018-03-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,430xx,OH,12.66,0.0,Jul-2002,735.0,739.0,0.0,81.0,,11.0,0.0,11690.0,41.6,26.0,f,4304.27,4304.27,2043.310000,2043.31,1695.73,347.58,0.00,0.0,0.0,Feb-2019,186.20,Apr-2019,Mar-2019,739.0,735.0,0.0,,1.0,Individual,,,,0.0,0.0,98868.0,2.0,4.0,2.0,4.0,1.0,87178.0,73.0,1.0,2.0,0.0,58.0,28100.0,1.0,1.0,1.0,6.0,8988.0,,,0.0,0.0,162.0,188.0,4.0,1.0,2.0,,,12.0,,0.0,0.0,7.0,0.0,2.0,15.0,7.0,9.0,7.0,11.0,0.0,0.0,0.0,3.0,92.3,,0.0,0.0,158833.0,98868.0,0.0,130733.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14069,90404084,,35000.0,35000.0,35000.0,60 months,17.99,888.58,D,D2,Upkeep,10+ years,MORTGAGE,110000.0,Verified,2016-10-01,Late (31-120 days),n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,630xx,MO,7.15,0.0,May-2002,670.0,674.0,0.0,75.0,,4.0,0.0,0.0,,11.0,w,23871.26,23871.26,24221.720000,24221.72,11128.74,12959.69,133.29,0.0,0.0,Mar-2019,933.01,Apr-2019,Mar-2019,749.0,745.0,0.0,79.0,1.0,Joint App,138000.0,12.35,Not Verified,0.0,0.0,95783.0,0.0,2.0,0.0,0.0,32.0,30903.0,64.0,0.0,0.0,0.0,64.0,0.0,0.0,1.0,1.0,0.0,31928.0,,,0.0,0.0,172.0,82.0,52.0,32.0,4.0,,,2.0,,1.0,0.0,0.0,0.0,0.0,5.0,1.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,90.9,,0.0,0.0,124470.0,30903.0,0.0,48314.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
14076,90581463,,2000.0,2000.0,2000.0,36 months,7.59,62.30,A,A3,Clerk,10+ years,MORTGAGE,45000.0,Not Verified,2016-10-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,home_improvement,Home improvement,620xx,IL,5.36,1.0,Oct-1992,700.0,704.0,1.0,3.0,,6.0,0.0,28433.0,85.6,31.0,w,389.24,389.24,1835.860000,1835.86,1610.76,225.10,0.00,0.0,0.0,Mar-2019,62.30,Apr-2019,Mar-2019,779.0,775.0,0.0,,1.0,Individual,,,,0.0,0.0,124925.0,0.0,0.0,0.0,0.0,147.0,0.0,,1.0,1.0,0.0,86.0,33200.0,0.0,5.0,1.0,2.0,20821.0,,,0.0,0.0,167.0,287.0,9.0,9.0,3.0,,3.0,5.0,3.0,0.0,0.0,3.0,0.0,11.0,4.0,5.0,24.0,3.0,6.0,0.0,0.0,0.0,1.0,87.1,,0.0,0.0,139550.0,28433.0,0.0,0.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
14576,90238787,,35000.0,35000.0,35000.0,60 months,17.99,888.58,D,D2,Shop Floorman,10+ years,RENT,117000.0,Source Verified,2016-10-01,Current,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,590xx,MT,21.66,1.0,Nov-2003,680.0,684.0,1.0,15.0,99.0,7.0,2.0,7662.0,51.4,39.0,f,21342.48,21342.48,26357.310000,26357.31,13657.52,12699.79,0.00,0.0,0.0,Mar-2019,888.58,Apr-2019,Mar-2019,694.0,690.0,0.0,56.0,1.0,Individual,,,,0.0,66.0,87119.0,1.0,4.0,2.0,6.0,6.0,79457.0,88.0,0.0,1.0,0.0,78.0,14900.0,3.0,4.0,10.0,7.0,12446.0,,,0.0,0.0,154.0,138.0,21.0,6.0,3.0,,15.0,0.0,15.0,2.0,0.0,3.0,0.0,4.0,27.0,3.0,9.0,3.0,7.0,0.0,0.0,0.0,2.0,86.8,,0.0,2.0,104759.0,87119.0,0.0,89859.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
14619,90229828,,25000.0,25000.0,24950.0,60 months,12.79,566.15,C,C1,Investment Officer,9 years,MORTGAGE,179000.0,Source Verified,2016-10-01,Fully Paid,n,https://lendingclub.com/browse/loanDetail.acti...,,debt_consolidation,Debt consolidation,201xx,VA,16.54,1.0,May-2007,675.0,679.0,0.0,5.0,,9.0,0.0,74976.0,82.2,24.0,f,0.00,0.00,28652.773716,28595.47,25000.00,3624.46,28.31,0.0,0.0,Dec-2017,21275.27,,Jan-2018,689.0,685.0,0.0,,1.0,Individual,,,,0.0,0.0,406136.0,1.0,4.0,2.0,5.0,5.0,64065.0,71.0,0.0,0.0,0.0,73.0,82000.0,0.0,16.0,0.0,5.0,45126.0,,,0.0,0.0,108.0,112.0,25.0,5.0,1.0,,,,,0.0,0.0,3.0,0.0,1.0,15.0,4.0,8.0,3.0,9.0,0.0,0.0,0.0,2.0,95.8,,0.0,0.0,480643.0,139041.0,0.0,90643.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [30]:
print(sum(accepted_data.revol_util.isna()))


1263


In [13]:
print(len(accepted_data))

1347798
