# Lending Club Case Study

## Import packages

In [116]:
#Import necessary packages
import chardet
import pandas as pd

## Load dataset

In [117]:
# Detect the encoding of the input file from its first 2000 bytes
with open('loan.csv', mode='rb') as raw_data:
    encoding_sample = raw_data.read(2000)
result = chardet.detect(encoding_sample)
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [118]:
# Load the loan dataset with every column read as text, so pandas does not
# infer (possibly mixed) dtypes per column. `str` is the dtype spelling that
# the pandas read_csv documentation gives for "preserve and do not interpret",
# used here instead of the NumPy string alias 'unicode'.
loan_data = pd.read_csv('loan.csv', dtype=str)

In [119]:
# Preview the first rows of the raw frame
loan_data.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,1077501,1296599,5000,5000,4975,36 months,10.65%,162.87,B,B2,...,,,,,0,0,,,,
1,1077430,1314167,2500,2500,2500,60 months,15.27%,59.83,C,C4,...,,,,,0,0,,,,
2,1077175,1313524,2400,2400,2400,36 months,15.96%,84.33,C,C5,...,,,,,0,0,,,,
3,1076863,1277178,10000,10000,10000,36 months,13.49%,339.31,C,C1,...,,,,,0,0,,,,
4,1075358,1311748,3000,3000,3000,60 months,12.69%,67.79,B,B5,...,,,,,0,0,,,,


In [120]:
# (rows, columns) of the raw frame
loan_data.shape

(39717, 111)

## Data cleaning

### Removing unnecessary and null columns

In [121]:
# Number of null values in each column of the raw frame
loan_data.isna().sum()

id                                0
member_id                         0
loan_amnt                         0
funded_amnt                       0
funded_amnt_inv                   0
                              ...  
tax_liens                        39
tot_hi_cred_lim               39717
total_bal_ex_mort             39717
total_bc_limit                39717
total_il_high_credit_limit    39717
Length: 111, dtype: int64

Several columns consist mostly of null values. These columns won't be of use to us in the analysis, so we will drop every column in which more than 80 percent of the values are null.

In [122]:
# Percentage of null values in each column of the raw frame
null_percent = loan_data.isna().sum().mul(100).div(len(loan_data))
null_percent

id                              0.000000
member_id                       0.000000
loan_amnt                       0.000000
funded_amnt                     0.000000
funded_amnt_inv                 0.000000
                                 ...    
tax_liens                       0.098195
tot_hi_cred_lim               100.000000
total_bal_ex_mort             100.000000
total_bc_limit                100.000000
total_il_high_credit_limit    100.000000
Length: 111, dtype: float64

In [123]:
# Tabulate each column's null percentage (rounded to 2 decimals) and count
# how many columns have more than 80% null values.
null_percent_df = pd.DataFrame(
    {
        'column_name': loan_data.columns,
        'null_percent': null_percent.round(2),
    }
)
null_percent_df['null_percent'].gt(80).sum()

56

There are 56 columns in which the percentage of null values exceeds 80 percent. We can drop these columns.

In [124]:
# Collect the names of the columns above the 80% null threshold and build
# master_frame from the raw frame without them.
columns_to_drop = null_percent_df.loc[
    null_percent_df['null_percent'] > 80, 'column_name'
].tolist()
master_frame = loan_data.drop(columns=columns_to_drop)
master_frame.shape

(39717, 55)

In [125]:
# Null percentage per column after removing the mostly-null columns;
# display only the columns that still contain nulls.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent.loc[new_null_percent > 0]

emp_title                      6.191303
emp_length                     2.706650
desc                          32.580507
title                          0.027696
mths_since_last_delinq        64.662487
revol_util                     0.125891
last_pymnt_d                   0.178765
last_credit_pull_d             0.005036
collections_12_mths_ex_med     0.140998
chargeoff_within_12_mths       0.140998
pub_rec_bankruptcies           1.754916
tax_liens                      0.098195
dtype: float64

In [126]:
# Tally the distinct values of chargeoff_within_12_mths (value_counts omits NaN by default)
master_frame['chargeoff_within_12_mths'].value_counts()

0    39661
Name: chargeoff_within_12_mths, dtype: int64

In [127]:
# Tally the distinct values of tax_liens (value_counts omits NaN by default)
master_frame['tax_liens'].value_counts()

0    39678
Name: tax_liens, dtype: int64

In [128]:
# Tally the distinct values of collections_12_mths_ex_med (value_counts omits NaN by default)
master_frame['collections_12_mths_ex_med'].value_counts()

0    39661
Name: collections_12_mths_ex_med, dtype: int64

In [129]:
# Tally the distinct values of policy_code
master_frame['policy_code'].value_counts()

1    39717
Name: policy_code, dtype: int64

In [130]:
# Tally the distinct values of pymnt_plan
master_frame['pymnt_plan'].value_counts()

n    39717
Name: pymnt_plan, dtype: int64

In [131]:
# Tally the distinct values of delinq_amnt
master_frame['delinq_amnt'].value_counts()

0    39717
Name: delinq_amnt, dtype: int64

In [132]:
# Tally the distinct values of acc_now_delinq
master_frame['acc_now_delinq'].value_counts()

0    39717
Name: acc_now_delinq, dtype: int64

In [133]:
# Tally the distinct values of application_type
master_frame['application_type'].value_counts()

INDIVIDUAL    39717
Name: application_type, dtype: int64

In [134]:
# Tally the distinct values of initial_list_status
master_frame['initial_list_status'].value_counts()

f    39717
Name: initial_list_status, dtype: int64

### Unnecessary columns:

desc: Loan description provided by the borrower. This is a free-text field containing descriptive data; processing it is beyond the scope of this assignment. The issue month can be obtained from the 'issue_d' field.

url: the URL does not help with the analysis and is the same for all entries with the difference being the ID ( which is a separate column ). 

chargeoff_within_12_mths: 'chargeoff_within_12_mths' only has '0' and null and does not contribute to the analysis.

tax_liens: 'tax_liens' only has '0' and null and does not contribute to the analysis.

collections_12_mths_ex_med: 'collections_12_mths_ex_med' only has '0' and null and does not contribute to the analysis.

policy_code: Has only value '1'

pymnt_plan: has only value 'n'

delinq_amnt: has only value '0'

acc_now_delinq: has only value '0'

application_type: has only type 'INDIVIDUAL'

initial_list_status: has only value 'f'






In [135]:
# Drop the columns listed above as unnecessary
drop_list = [
    'desc',
    'tax_liens',
    'chargeoff_within_12_mths',
    'collections_12_mths_ex_med',
    'url',
    'policy_code',
    'pymnt_plan',
    'delinq_amnt',
    'acc_now_delinq',
    'application_type',
    'initial_list_status',
]
master_frame = master_frame.drop(columns=drop_list)

In [136]:
# (rows, columns) of master_frame at this point
master_frame.shape

(39717, 44)

In [137]:
# Null percentage per column after dropping the unnecessary columns;
# display only the columns that still contain nulls.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent.loc[new_null_percent > 0]

emp_title                  6.191303
emp_length                 2.706650
title                      0.027696
mths_since_last_delinq    64.662487
revol_util                 0.125891
last_pymnt_d               0.178765
last_credit_pull_d         0.005036
pub_rec_bankruptcies       1.754916
dtype: float64

In [138]:
# Count the rows that contain at least one NaN value
rows_with_nan = master_frame.isna().any(axis=1)
print(rows_with_nan.sum())

27218


In [139]:
# Percentage of NaN values in each row (relative to the number of columns);
# list the rows with at least one NaN, highest percentage first.
total_columns = len(master_frame.columns)
null_count_row = master_frame.isna().sum(axis=1).mul(100).div(total_columns)
null_count_row.loc[null_count_row > 0].sort_values(ascending=False)

16719    9.090909
4714     9.090909
24984    9.090909
14839    9.090909
28547    9.090909
           ...   
26148    2.272727
26149    2.272727
26150    2.272727
26151    2.272727
19856    2.272727
Length: 27218, dtype: float64

The percent of null values in each row is less than 10 percent. So we will leave it as is and try to impute the remaining null values for the columns.

### Unnecessary rows

In [140]:
# Number of loans in each loan_status category
master_frame['loan_status'].value_counts()

Fully Paid     32950
Charged Off     5627
Current         1140
Name: loan_status, dtype: int64

We don't need rows with loan_status = 'Current' as the borrowers are still paying the loan.

In [141]:
# Exclude the rows whose loan_status is 'Current'
master_frame = master_frame.loc[master_frame['loan_status'].ne('Current')]

In [142]:
# Null percentage per column after removing the 'Current' rows;
# display only the columns that still contain nulls.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent.loc[new_null_percent > 0]

emp_title                  6.185033
emp_length                 2.677761
title                      0.028514
mths_since_last_delinq    64.559193
revol_util                 0.129611
last_pymnt_d               0.184047
last_credit_pull_d         0.005184
pub_rec_bankruptcies       1.806776
dtype: float64

mths_since_last_delinq still has a considerable share of null values (about 65 percent). Imputing this column would not give any meaningful information, so we can drop it.

In [143]:
# Remove the mths_since_last_delinq column from master_frame
master_frame = master_frame.drop(columns=['mths_since_last_delinq'])

### Imputing the remaining columns having missing values

In [147]:
# Null percentage per column after dropping mths_since_last_delinq;
# display only the columns that still need imputing.
new_null_percent = master_frame.isna().sum().mul(100).div(len(master_frame))
new_null_percent.loc[new_null_percent > 0]

emp_title               6.185033
emp_length              2.677761
title                   0.028514
revol_util              0.129611
last_pymnt_d            0.184047
last_credit_pull_d      0.005184
pub_rec_bankruptcies    1.806776
dtype: float64

## To-do:
1. emp_title     => null to 'others'
2. emp_length    => null to '0 years' ?
                    convert to numeric ?
3. title         => null to 'others'
4. revol_util    => to numeric  
                    remove percent    
                    binning?      
                    mean, mode or median to impute?
5. last_pymnt_d  => to date format    
                    remove null rows?
6. last_credit_pull_d    => to date format    
                            remove null rows?
7. pub_rec_bankruptcies  => null values to '0'

In [156]:
# Replace every NaN in emp_title with the placeholder 'Others'.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that form is chained
# assignment through an in-place method, which raises a FutureWarning in
# pandas 2.2+ and leaves master_frame unchanged under Copy-on-Write.
master_frame['emp_title'] = master_frame['emp_title'].fillna('Others')

In [161]:
# Number of loans for each emp_length value
master_frame['emp_length'].value_counts()

10+ years    8488
< 1 year     4508
2 years      4291
3 years      4012
4 years      3342
5 years      3194
1 year       3169
6 years      2168
7 years      1711
8 years      1435
9 years      1226
Name: emp_length, dtype: int64

In [145]:
# Number of rows with a missing emp_length
master_frame['emp_length'].isna().sum()

1033

In [146]:
# (rows, columns) of master_frame at this point
master_frame.shape

(38577, 43)

In [164]:
# Loan status of the rows whose emp_length is missing
master_frame.loc[master_frame['emp_length'].isnull(), 'loan_status']

168      Charged Off
323       Fully Paid
394       Fully Paid
422      Charged Off
439       Fully Paid
            ...     
32591     Fully Paid
32608     Fully Paid
32621    Charged Off
32631     Fully Paid
32665     Fully Paid
Name: loan_status, Length: 1033, dtype: object