In [27]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
pd.set_option('display.float', '{:.2f}'.format)

df = pd.read_csv('lending_club_loan_two.csv')
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,address
0,10000.0,36 months,11.44,329.48,B,B4,Marketing,10+ years,RENT,117000.0,...,16.0,0.0,36369.0,41.8,25.0,w,INDIVIDUAL,0.0,0.0,"0174 Michelle Gateway\r\nMendozaberg, OK 22690"
1,8000.0,36 months,11.99,265.68,B,B5,Credit analyst,4 years,MORTGAGE,65000.0,...,17.0,0.0,20131.0,53.3,27.0,f,INDIVIDUAL,3.0,0.0,"1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113"
2,15600.0,36 months,10.49,506.97,B,B3,Statistician,< 1 year,RENT,43057.0,...,13.0,0.0,11987.0,92.2,26.0,f,INDIVIDUAL,0.0,0.0,"87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113"
3,7200.0,36 months,6.49,220.65,A,A2,Client Advocate,6 years,RENT,54000.0,...,6.0,0.0,5472.0,21.5,13.0,f,INDIVIDUAL,0.0,0.0,"823 Reid Ford\r\nDelacruzside, MA 00813"
4,24375.0,60 months,17.27,609.33,C,C5,Destiny Management Inc.,9 years,MORTGAGE,55000.0,...,13.0,0.0,24584.0,69.8,43.0,f,INDIVIDUAL,1.0,0.0,"679 Luna Roads\r\nGreggshire, VA 11650"


In [28]:
df['loan_status'].value_counts()

loan_status
Fully Paid     318357
Charged Off     77673
Name: count, dtype: int64

In [29]:
df = df[['loan_amnt', 'term', 'int_rate', 'installment', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'loan_status', 'purpose',
    'dti', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'initial_list_status', 'application_type',
    'mort_acc', 'pub_rec_bankruptcies']]

In [30]:
for year in df.emp_length.unique():
    print(f"{year} years in this position:")
    print(f"{df[df.emp_length == year].loan_status.value_counts(normalize=True)}")
    print('==========================================')

10+ years years in this position:
loan_status
Fully Paid    0.82
Charged Off   0.18
Name: proportion, dtype: float64
4 years years in this position:
loan_status
Fully Paid    0.81
Charged Off   0.19
Name: proportion, dtype: float64
< 1 year years in this position:
loan_status
Fully Paid    0.79
Charged Off   0.21
Name: proportion, dtype: float64
6 years years in this position:
loan_status
Fully Paid    0.81
Charged Off   0.19
Name: proportion, dtype: float64
9 years years in this position:
loan_status
Fully Paid    0.80
Charged Off   0.20
Name: proportion, dtype: float64
2 years years in this position:
loan_status
Fully Paid    0.81
Charged Off   0.19
Name: proportion, dtype: float64
3 years years in this position:
loan_status
Fully Paid    0.80
Charged Off   0.20
Name: proportion, dtype: float64
8 years years in this position:
loan_status
Fully Paid    0.80
Charged Off   0.20
Name: proportion, dtype: float64
7 years years in this position:
loan_status
Fully Paid    0.81
Charged Off   

In [31]:
df['emp_length'] = df['emp_length'].fillna(df['emp_length'].mode()[0])
df['mort_acc'] = df['mort_acc'].fillna(df['mort_acc'].mean())
df['revol_util'] = df['revol_util'].fillna(df['revol_util'].mean())
df['pub_rec_bankruptcies'] = df['pub_rec_bankruptcies'].fillna(df['pub_rec_bankruptcies'].mean())

In [32]:
df['term'] = df['term'].str[:3].astype(int)

def data_tran(col):
    l = list(df[col].value_counts().index)
    conv_l = {l[i]:i for i in range(len(l))}
    df[col] = df[col].apply(conv_l.get)

column_list = ['emp_length','sub_grade','verification_status','loan_status','home_ownership','purpose','initial_list_status','application_type','loan_status']
for i in column_list:
    data_tran(i)

In [33]:
Counter(df['loan_status'])

Counter({0: 318357, 1: 77673})

In [34]:
over = SMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.7)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

df_balance, df_balance['loan_status']= pipeline.fit_resample(df.drop(['loan_status'],axis=1), df['loan_status'])

In [35]:
Counter(df_balance['loan_status'])

Counter({0: 181917, 1: 127342})

In [9]:
df = df_balance

In [10]:
for year in df.emp_length.unique():
    print(f"{year} years in this position:")
    print(f"{df[df.emp_length == year].loan_status.value_counts(normalize=True)}")
    print('==========================================')

6 years in this position:
loan_status
0   0.59
1   0.41
Name: proportion, dtype: float64
2 years in this position:
loan_status
0   0.53
1   0.47
Name: proportion, dtype: float64
1 years in this position:
loan_status
0   0.54
1   0.46
Name: proportion, dtype: float64
8 years in this position:
loan_status
0   0.62
1   0.38
Name: proportion, dtype: float64
0 years in this position:
loan_status
0   0.61
1   0.39
Name: proportion, dtype: float64
9 years in this position:
loan_status
0   0.65
1   0.35
Name: proportion, dtype: float64
10 years in this position:
loan_status
0   0.69
1   0.31
Name: proportion, dtype: float64
7 years in this position:
loan_status
0   0.60
1   0.40
Name: proportion, dtype: float64
3 years in this position:
loan_status
0   0.56
1   0.44
Name: proportion, dtype: float64
5 years in this position:
loan_status
0   0.57
1   0.43
Name: proportion, dtype: float64
4 years in this position:
loan_status
0   0.56
1   0.44
Name: proportion, dtype: float64
