In [3]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Render floats with two decimals in DataFrame/Series output.
# The option is addressed by its full name: pandas resolves option keys with a
# regex search, so the abbreviated 'display.float' only works for as long as it
# matches exactly one registered option.
pd.set_option('display.float_format', '{:.2f}'.format)

# Columns kept from the raw file for the rest of the notebook.
selected_columns = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'loan_status', 'purpose',
    'dti', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc', 'initial_list_status', 'application_type',
    'mort_acc', 'pub_rec_bankruptcies',
]

# Parse only the needed columns (usecols) instead of loading the whole file and
# discarding the rest afterwards.
df = pd.read_csv('lending_club_loan_two.csv', usecols=selected_columns)

#Select columns
# usecols returns columns in file order, so index by the list to get the order
# declared above. The explicit .copy() makes df an independent frame, so the
# column assignments in the following steps do not raise SettingWithCopyWarning
# on pandas versions that emit it.
df = df[selected_columns].copy()

#Missing Value
# emp_length is filled with its most frequent value; the three numeric columns
# are filled with their own column mean.
imputed_values = {'emp_length': df['emp_length'].mode().iloc[0]}
for numeric_col in ('mort_acc', 'revol_util', 'pub_rec_bankruptcies'):
    imputed_values[numeric_col] = df[numeric_col].mean()
df = df.fillna(value=imputed_values)


#Transform Data
# Keep the first three characters of `term` and cast them to int.
term_prefix = df['term'].str.slice(0, 3)
df['term'] = term_prefix.astype(int)

def data_tran(col, frame=None):
    """Label-encode a column in place using frequency rank as the code.

    The most frequent value becomes 0, the next most frequent 1, and so on
    (the order is the one returned by ``value_counts()``).

    Parameters
    ----------
    col : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``df`` is used, so
        existing ``data_tran(col)`` calls keep working.

    Returns
    -------
    None
        The column is overwritten on the target frame.

    Notes
    -----
    Missing values are not counted by ``value_counts()`` and therefore have no
    code; with ``Series.map`` they stay NaN.
    """
    target = df if frame is None else frame
    levels = target[col].value_counts().index
    codes = {level: code for code, level in enumerate(levels)}
    # Series.map is the vectorised dictionary lookup; no per-row Python call.
    target[col] = target[col].map(codes)

# Columns to label-encode with data_tran. Each column appears exactly once:
# 'loan_status' was previously listed twice, which ran the encoder a second time
# over the already-encoded integer codes.
column_list = [
    'emp_length', 'sub_grade', 'verification_status', 'loan_status',
    'home_ownership', 'purpose', 'initial_list_status', 'application_type',
]
for column_name in column_list:
    data_tran(column_name)

#Outlier
# Inclusive upper bound per column; a row is kept only when every listed column
# is at or below its bound (rows with a missing value in one of them are dropped,
# because the comparison is False for NaN).
outlier_caps = {
    'annual_inc': 250000,
    'dti': 50,
    'open_acc': 40,
    'total_acc': 80,
    'revol_util': 120,
    'revol_bal': 250000,
}
within_caps = (
    df[list(outlier_caps)]
    .le(pd.Series(outlier_caps), axis='columns')
    .all(axis=1)
)
df = df[within_caps]

#Imbalance
# Both samplers draw random numbers; a fixed seed makes the resampled frame
# reproducible under Restart & Run All.
RANDOM_STATE = 42

# Oversample with SMOTE (sampling_strategy=0.4), then randomly undersample
# (sampling_strategy=0.7).
over = SMOTE(sampling_strategy=0.4, random_state=RANDOM_STATE)
under = RandomUnderSampler(sampling_strategy=0.7, random_state=RANDOM_STATE)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# NOTE(review): resampling happens on the full frame. If a train/test split is
# made later in the notebook, synthetic rows will end up in the evaluation
# data; confirm and, if so, resample the training part only.
X_resampled, y_resampled = pipeline.fit_resample(
    df.drop(columns=['loan_status']), df['loan_status']
)

# Re-attach the target as the last column (explicit two-step form of the former
# `df_balance, df_balance['loan_status'] = ...` tuple assignment).
df_balance = X_resampled.assign(loan_status=y_resampled)

df = df_balance

In [4]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,sub_grade,emp_length,home_ownership,annual_inc,verification_status,purpose,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies,loan_status
147253,8000.0,36,18.49,291.19,13,4,1,45000.0,0,0,...,5.0,0.0,2978.0,52.2,13.0,0,0,0.0,0.0,0
334671,6000.0,36,16.29,211.81,10,7,1,48000.0,0,1,...,8.0,0.0,12948.0,59.4,10.0,0,0,1.81,0.0,0
279155,13000.0,60,13.35,298.13,3,8,1,39800.0,1,1,...,13.0,0.0,11698.0,57.9,30.0,1,0,0.0,0.0,0
183432,8000.0,36,21.6,303.88,22,8,1,65000.0,2,0,...,15.0,0.0,12682.0,86.3,23.0,0,0,0.0,0.0,0
145765,14000.0,36,13.49,475.03,3,3,0,49000.0,0,0,...,17.0,0.0,41817.0,64.0,29.0,1,0,3.0,0.0,0


In [8]:
# Apply log(1 + x) to the listed columns. np.log1p is the dedicated ufunc for
# this: it is more accurate than np.log(x + 1) for values close to zero and works
# on the whole sub-frame at once, without a per-column lambda.
# NOTE: this cell overwrites the columns of df in place, so running it more than
# once applies the transform repeatedly; re-run the preceding cells first.
log_columns = ['pub_rec_bankruptcies', 'pub_rec', 'mort_acc', 'total_acc', 'annual_inc', 'revol_bal']
df[log_columns] = np.log1p(df[log_columns])

In [9]:
# Min-max scale df with the scaler's default settings. The whole frame is passed,
# so every column of df is scaled (loan_status included, if it is still in df)
# and the result is a plain NumPy array without column labels.
scaler = MinMaxScaler()
scaler.fit(df)
df_scaled = scaler.transform(df)
df_scaled

array([[0.18987342, 0.        , 0.51305025, ..., 0.        , 0.        ,
        0.        ],
       [0.13924051, 0.        , 0.4273471 , ..., 0.29852352, 0.        ,
        0.        ],
       [0.3164557 , 1.        , 0.31281652, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29113924, 0.16666667, 0.37585916, ..., 0.29663423, 0.        ,
        1.        ],
       [0.24050633, 0.41666667, 0.50840984, ..., 0.22215382, 0.        ,
        1.        ],
       [0.67963886, 1.        , 0.52211706, ..., 0.38529632, 0.        ,
        1.        ]])