In [1]:
import warnings

# Install the DeprecationWarning filter first: it is registered ahead of the
# numpy/pandas imports, so it is already active while those modules load.
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Libraries used by the rest of the notebook.
import numpy as np
import pandas as pd

# Remove pandas' cap on displayed rows so long results are shown in full.
pd.options.display.max_rows = None

In [2]:
# Load Resources/train.csv into the notebook's working DataFrame.
df = pd.read_csv(filepath_or_buffer='Resources/train.csv')

In [4]:
# Remember the number of rows in the loaded frame; the column-pruning cell
# later in the notebook derives its non-null threshold from this value.
row_count = len(df)

# Keep the shape as the cell's final expression so Jupyter actually renders it.
# Placed above the assignment it was evaluated and silently discarded.
df.shape

In [None]:
# Pin NumPy's global random state so repeated runs draw the same numbers.
np.random.seed(seed=42)

In [None]:
# Peek at the top of the dataset (head() returns five rows by default).
df.head()

In [None]:
# Peek at the bottom of the dataset (tail() returns five rows by default).
df.tail()

In [None]:
# Count the missing entries in each column: flag every null cell, then sum
# the flags column by column.
missing_values_count = df.isna().sum(axis=0)
missing_values_count

In [None]:
# How many total missing values do we have?
# total_cells is rows * columns. np.prod is used instead of np.product:
# np.product is a deprecated alias that NumPy 2.0 removed, so the old call
# raises AttributeError on current NumPy releases.
total_cells = np.prod(df.shape)

# missing_values_count is the per-column null tally computed in the cell
# just above; summing it gives the number of missing cells overall.
total_missing = missing_values_count.sum()

# Percent of data that is missing.
percent_missing = (total_missing / total_cells) * 100
percent_missing

In [5]:
# Drop the columns where more than 50% of elements are missing.
# `thresh` is the minimum number of non-null values a column must have to be
# kept, and pandas documents it as an int. (row_count + 1) // 2 is the integer
# ceiling of row_count / 2; because non-null counts are whole numbers it keeps
# exactly the same columns as the former float threshold row_count / 2.
# The result is rebound to `df` rather than using inplace=True, so the cell
# reads as an explicit transformation and gives the same result when re-run.
df = df.dropna(axis='columns', thresh=(row_count + 1) // 2)
df.shape

(517788, 87)

In [6]:
# Recount the missing entries in each column of the current frame: flag every
# null cell, then sum the flags column by column.
missing_values_count = df.isna().sum(axis=0)
missing_values_count

loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
term                              0
int_rate                          0
installment                       0
grade                             0
sub_grade                         0
emp_title                     34051
emp_length                    31300
home_ownership                    0
annual_inc                        0
verification_status               0
issue_d                           0
pymnt_plan                        0
purpose                           0
title                          6617
zip_code                          1
addr_state                        0
dti                             132
delinq_2yrs                       0
earliest_cr_line                  0
inq_last_6mths                    0
open_acc                          0
pub_rec                           0
revol_bal                         0
revol_util                      321
total_acc                   

In [17]:
# Show every column, and set the displayed-row limit to 5 so long frames
# are truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = 5

# Display the rows that contain at least one missing value: any(axis=1)
# reduces across the columns, yielding one boolean per row.
df.loc[df.isna().any(axis=1)]

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag,loan_status
0,14000,14000,14000.0,36 months,7.35,434.53,A,A4,Paralegal,1 year,MORTGAGE,44000.0,Not Verified,Aug-2017,n,debt_consolidation,Debt consolidation,740xx,OK,34.70,0.0,Feb-2003,0.0,7.0,1.0,1067,5.4,18.0,f,0.0,0.0,15118.387808,15118.39,14000.0,1118.39,0.0,0.0,0.0,Dec-2018,8611.87,Dec-2018,0.0,1,Individual,0.0,0.0,23659.0,19900.0,2.0,3380.0,13147.0,3.3,0.0,0.0,62.0,174.0,80.0,9.0,0.0,171.0,,0.0,1.0,3.0,1.0,3.0,7.0,3.0,11.0,3.0,7.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,1.0,67943.0,23659.0,13600.0,48043.0,N,Cash,N,1
1,2000,2000,2000.0,36 months,16.29,70.61,D,D2,,,MORTGAGE,11420.0,Verified,Jul-2014,n,debt_consolidation,Debt consolidation,337xx,FL,20.61,0.0,Mar-1982,0.0,10.0,1.0,4168,42.5,21.0,w,0.0,0.0,2541.534174,2541.53,2000.0,541.53,0.0,0.0,0.0,Jul-2017,70.18,Jul-2017,0.0,1,Individual,0.0,0.0,4168.0,9800.0,6.0,417.0,91.0,93.0,0.0,0.0,144.0,388.0,6.0,6.0,0.0,9.0,9.0,0.0,1.0,4.0,1.0,6.0,2.0,10.0,19.0,4.0,10.0,0.0,0.0,0.0,4.0,93.7,100.0,1.0,0.0,9800.0,4168.0,1300.0,0.0,N,Cash,N,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517780,10000,10000,10000.0,36 months,13.67,340.18,C,C3,,,MORTGAGE,38500.0,Verified,Jun-2016,n,debt_consolidation,Debt consolidation,320xx,FL,15.62,1.0,Oct-1988,2.0,10.0,1.0,4869,27.5,39.0,w,0.0,0.0,11168.683499,11168.68,10000.0,1168.68,0.0,0.0,0.0,May-2017,7774.47,Oct-2018,0.0,1,Individual,0.0,0.0,13504.0,17700.0,3.0,1500.0,891.0,75.9,0.0,0.0,166.0,332.0,4.0,3.0,3.0,4.0,2.0,0.0,2.0,5.0,3.0,20.0,9.0,8.0,27.0,5.0,10.0,0.0,0.0,0.0,2.0,94.1,50.0,1.0,0.0,33074.0,13504.0,3700.0,15374.0,N,Cash,N,1
517782,10000,10000,10000.0,36 months,12.12,332.72,B,B3,Copeland Oaks,2 years,RENT,23000.0,Not Verified,Apr-2012,n,debt_consolidation,Finally,444xx,OH,18.13,0.0,Aug-2006,1.0,6.0,0.0,5580,66.4,11.0,f,0.0,0.0,11977.780000,11977.78,10000.0,1977.78,0.0,0.0,0.0,Apr-2015,339.75,Feb-2019,0.0,1,Individual,0.0,,,,3.0,,2042.0,67.1,0.0,0.0,,,,,0.0,19.0,2.0,,,,,,,,,,,,,,,,50.0,0.0,0.0,,12136.0,6200.0,,N,Cash,N,1
