# Lending Club Case Study

## Import packages

In [51]:
#Import necessary packages
import chardet
import pandas as pd

## Load dataset

In [52]:
# Sniff the encoding of the raw CSV from its first 2000 bytes before parsing it.
with open('loan.csv', 'rb') as raw_data:
    head_bytes = raw_data.read(2000)
# Detection runs on the in-memory sample, after the file handle is closed.
result = chardet.detect(head_bytes)
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [53]:
# Load the loan dataset. dtype='unicode' asks pandas to read every column as
# text instead of inferring numeric types.
# NOTE(review): numeric columns (e.g. loan_amnt) will therefore need explicit
# conversion before any arithmetic — confirm this is intended.
loan_data = pd.read_csv('loan.csv',dtype='unicode')

In [54]:
# Preview the first five rows to sanity-check that the file parsed as expected.
loan_data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975,36 months,10.65%,162.87,B,B2,...,,,,,0,0,,,,
1,1077430,1314167,2500,2500,2500,60 months,15.27%,59.83,C,C4,...,,,,,0,0,,,,
2,1077175,1313524,2400,2400,2400,36 months,15.96%,84.33,C,C5,...,,,,,0,0,,,,
3,1076863,1277178,10000,10000,10000,36 months,13.49%,339.31,C,C1,...,,,,,0,0,,,,
4,1075358,1311748,3000,3000,3000,60 months,12.69%,67.79,B,B5,...,,,,,0,0,,,,


In [55]:
# (rows, columns) of the raw dataset.
loan_data.shape

(39717, 111)

## Data cleaning

In [56]:
# Number of null values in each column.
loan_data.isnull().sum()

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
                              ...  
tax_liens                        39
tot_hi_cred_lim               39717
total_bal_ex_mort             39717
total_bc_limit                39717
total_il_high_credit_limit    39717
Length: 111, dtype: int64

There are columns with many null values. We can drop these columns, as they won't be of use to us in the analysis. We will drop every column in which more than 80 percent of the values are null.

In [57]:
# Share of missing values per column, expressed as a percentage of all rows.
null_percent = loan_data.isna().sum().mul(100).div(len(loan_data))
null_percent

id                              0.000000
member_id                       0.000000
loan_amnt                       0.000000
funded_amnt                     0.000000
funded_amnt_inv                 0.000000
                                 ...    
tax_liens                       0.098195
tot_hi_cred_lim               100.000000
total_bal_ex_mort             100.000000
total_bc_limit                100.000000
total_il_high_credit_limit    100.000000
Length: 111, dtype: float64

In [58]:
# Pair every column name with its null percentage (rounded to two decimals),
# then count the columns in which more than 80% of the values are null.
null_percent_df = pd.DataFrame(
    {
        'column_name': loan_data.columns,
        'null_percent': null_percent.round(2),
    }
)
null_percent_df['null_percent'].gt(80).sum()

56

There are 56 columns in which more than 80 percent of the values are null. We can drop these columns.

In [59]:
# Collect the names of the columns that are more than 80% null and build the
# working frame from the raw data without them.
columns_to_drop = null_percent_df.loc[
    null_percent_df['null_percent'] > 80, 'column_name'
].tolist()
master_frame = loan_data.drop(columns=columns_to_drop)
master_frame.shape

(39717, 55)

In [60]:
# Re-check the null percentage after removing the mostly-null columns,
# showing only the columns that still have missing values.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent.loc[new_null_percent.gt(0)]

emp_title                      6.191303
emp_length                     2.706650
desc                          32.580507
title                          0.027696
mths_since_last_delinq        64.662487
revol_util                     0.125891
last_pymnt_d                   0.178765
last_credit_pull_d             0.005036
collections_12_mths_ex_med     0.140998
chargeoff_within_12_mths       0.140998
pub_rec_bankruptcies           1.754916
tax_liens                      0.098195
dtype: float64

In [61]:
# Frequency of each distinct value in `chargeoff_within_12_mths`.
# value_counts() excludes NaN by default, so rows with a missing value are
# not counted here and the total can be lower than the number of rows.
master_frame['chargeoff_within_12_mths'].value_counts()

0    39661
Name: chargeoff_within_12_mths, dtype: int64

In [62]:
# Frequency of each distinct value in `tax_liens`.
# value_counts() excludes NaN by default, so rows with a missing value are
# not counted here and the total can be lower than the number of rows.
master_frame['tax_liens'].value_counts()

0    39678
Name: tax_liens, dtype: int64

In [63]:
# Frequency of each distinct value in `collections_12_mths_ex_med`.
# value_counts() excludes NaN by default, so rows with a missing value are
# not counted here and the total can be lower than the number of rows.
master_frame['collections_12_mths_ex_med'].value_counts()

0    39661
Name: collections_12_mths_ex_med, dtype: int64

In [64]:
# Frequency of each distinct value in `policy_code`; a single value across
# all rows means the column cannot help distinguish loans.
master_frame['policy_code'].value_counts()

1    39717
Name: policy_code, dtype: int64

In [65]:
# Frequency of each distinct value in `pymnt_plan`; a single value across
# all rows means the column cannot help distinguish loans.
master_frame['pymnt_plan'].value_counts()

n    39717
Name: pymnt_plan, dtype: int64

In [66]:
# Frequency of each distinct value in `delinq_amnt`; a single value across
# all rows means the column cannot help distinguish loans.
master_frame['delinq_amnt'].value_counts()

0    39717
Name: delinq_amnt, dtype: int64

In [67]:
# Frequency of each distinct value in `acc_now_delinq`; a single value across
# all rows means the column cannot help distinguish loans.
master_frame['acc_now_delinq'].value_counts()

0    39717
Name: acc_now_delinq, dtype: int64

In [68]:
# Frequency of each distinct value in `application_type`; a single value
# across all rows means the column cannot help distinguish loans.
master_frame['application_type'].value_counts()

INDIVIDUAL    39717
Name: application_type, dtype: int64

In [69]:
# Frequency of each distinct value in `initial_list_status`; a single value
# across all rows means the column cannot help distinguish loans.
master_frame['initial_list_status'].value_counts()

f    39717
Name: initial_list_status, dtype: int64

In [71]:
# Drop the columns that add nothing to the analysis: the single-valued columns
# inspected in the preceding cells, plus `desc` and `url`.
# NOTE(review): `desc` and `url` were not inspected above — confirm they are
# meant to be removed here.
drop_list = ['desc', 'tax_liens', 'chargeoff_within_12_mths', 'collections_12_mths_ex_med', 'url',
             'policy_code', 'pymnt_plan', 'delinq_amnt', 'acc_now_delinq', 'application_type',
             'initial_list_status']
# errors='ignore' keeps this cell safe to re-run: master_frame is reassigned
# from itself, so on a second run the columns are already gone and a plain
# drop would raise KeyError.
master_frame = master_frame.drop(columns=drop_list, errors='ignore')

In [72]:
# (rows, columns) of the working frame after the column drops.
master_frame.shape

(39717, 44)

In [73]:
# Null percentage per remaining column after the latest drops; only columns
# that still contain missing values are displayed.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent[new_null_percent.gt(0)]

emp_title                  6.191303
emp_length                 2.706650
title                      0.027696
mths_since_last_delinq    64.662487
revol_util                 0.125891
last_pymnt_d               0.178765
last_credit_pull_d         0.005036
pub_rec_bankruptcies       1.754916
dtype: float64