In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
# Widen pandas' display limits so the 43-column frame is shown in full.
# Option names are written out in full instead of relying on pandas'
# pattern matching of abbreviated names.
for option_name, option_value in (
    ("display.max_columns", 200),
    ("display.max_rows", 160),
    ("display.max_colwidth", 200),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the pre-processed loan table (path is relative to the notebook's folder).
data = pd.read_csv(filepath_or_buffer='../../Data/20x43_DEFG106017_5.csv')

In [4]:
# Peek at the first five rows to sanity-check the columns and their scaling.
data.head(n=5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,mths_since_last_delinq,acc_now_delinq,collections_12_mths_ex_med,pub_rec,months_cr_line,term,initial_list_status,grade,sub_grade,emp_length,ANY,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,-0.688961,-0.098407,-0.878946,1.908757,1.908757,-0.572337,-0.680069,-0.344434,-0.847799,-0.559022,1.176249,0.379582,-0.304934,-0.078675,-0.150104,-0.440423,-0.879612,-0.255268,1.415665,-0.029548,0.54369,-0.520336,-0.036999,-0.797214,-0.351647,1.0,-0.126381,-0.231636,-0.749244,3.41565,-0.157698,-0.194825,-0.220784,-0.218218,-0.561299,-0.069338,-0.252217,-0.17609,-0.08901,-0.598141,1.218142,-0.708013,0.0
1,-0.955672,-0.272715,0.853446,0.113095,0.113095,0.621456,-0.892889,-0.745154,-0.932905,-0.449493,0.415315,-0.463741,-0.896243,-0.078675,-0.150104,-0.440423,0.314916,-0.255268,1.415665,1.342791,1.503181,-1.587261,-0.036999,1.254368,-0.351647,-1.0,-0.126381,-0.231636,1.334679,-0.29277,-0.157698,-0.194825,-0.220784,-0.218218,-0.561299,-0.069338,-0.252217,-0.17609,-0.08901,-0.598141,1.218142,-0.708013,0.0
2,0.143178,-0.08325,1.574731,-0.656475,-0.656475,-0.572337,0.145642,0.256648,0.93942,-0.097825,2.698116,2.909549,-0.896243,-0.078675,-0.150104,0.894995,-0.355482,-0.255268,1.415665,-0.029548,0.54369,-0.520336,-0.036999,1.254368,-0.351647,-1.0,-0.126381,-0.231636,-0.749244,-0.29277,-0.157698,-0.194825,-0.220784,-0.218218,-0.561299,-0.069338,3.964846,-0.17609,-0.08901,-0.598141,1.218142,-0.708013,1.0
3,-0.571608,-0.242401,1.060462,-0.912998,-0.912998,1.408668,-0.452957,-0.344434,-0.422271,-0.422451,2.698116,0.379582,0.331859,-0.078675,-0.150104,0.894995,-0.562696,-0.255268,1.415665,1.342791,2.142842,0.013127,-0.036999,1.254368,-0.351647,-1.0,-0.126381,-0.231636,1.334679,-0.29277,-0.157698,-0.194825,-0.220784,-0.218218,-0.561299,-0.069338,-0.252217,-0.17609,-0.08901,-0.598141,1.218142,-0.708013,1.0
4,0.207189,-0.227244,1.467955,-0.143429,-0.143429,-0.99622,0.175789,0.256648,-0.081848,0.130144,-0.345619,-0.463741,-0.896243,-0.078675,-0.150104,0.894995,1.16815,-0.255268,1.415665,-0.029548,0.223859,1.080052,-0.036999,1.254368,-0.351647,-1.0,-0.126381,-0.231636,-0.749244,3.41565,-0.157698,-0.194825,-0.220784,-0.218218,-0.561299,-0.069338,-0.252217,-0.17609,-0.08901,-0.598141,-0.820922,1.412403,1.0


In [5]:
# (rows, columns) of the loaded frame, including the target column.
data.shape

(2926, 43)

In [6]:
# Class balance of the binary target.
data['loan_status_bin'].value_counts()

1.0    2260
0.0     666
Name: loan_status_bin, dtype: int64

In [7]:
# Null model: always predicting class 1 (the "good loan" class, per the original
# note) is right for the share of rows whose target equals 1 -- about 77% on this
# file. This is the accuracy baseline every model below has to beat.
# A boolean mean is used instead of value_counts()[1], which depended on an
# integer key being resolved against the float labels 0.0 / 1.0.
(data['loan_status_bin'] == 1).mean()

0.7723855092276145

In [8]:
# Separate the predictors from the target.
# 'Unnamed: 0' appears to be the row index that was written into the CSV (it
# reads 0, 1, 2, ... in data.head()), not a borrower attribute, so it is removed
# as well; otherwise the feature selector could pick a meaningless row counter.
# errors='ignore' keeps the cell working if the file is re-exported without it.
X = data.drop(columns=['loan_status_bin', 'Unnamed: 0'], errors='ignore')
X.shape

(2926, 42)

In [9]:
# Binary target vector.
y = data.loc[:, 'loan_status_bin']

In [10]:
# Sequential feature selection driven by a logistic-regression estimator,
# keeping n_features columns.
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=lr,
    n_features_to_select=n_features,
    direction='forward',  # scikit-learn's default, spelled out for the reader
)

In [11]:
# Run the selection on the full predictor matrix.
# NOTE(review): the selector sees every row, including those later used as CV
# validation folds, so the downstream accuracy estimates may be optimistic.
sfs.fit(X=X, y=y)

In [12]:
# Debug aid: uncomment to inspect the boolean mask of selected columns.
#sfs.get_support()

In [13]:
# Report the names of the columns kept by the selector.
print(
    "Top {} features selected by forward sequential selection:{}".format(
        n_features,
        X.columns[sfs.get_support()].tolist(),
    )
)


Top 3 features selected by forward sequential selection:['loan_amnt', 'annual_inc', 'fico_range_low']


In [14]:
# Names of the selected columns as a plain Python list.
feat = X.columns[sfs.get_support()].tolist()
feat

['loan_amnt', 'annual_inc', 'fico_range_low']

In [15]:
# Predictor matrix restricted to the selected columns.
X_feat = data.loc[:, feat]

In [16]:
# Target vector again (same column as assigned to y earlier), shown for reference.
y = data.loc[:, 'loan_status_bin']
y

0       0.0
1       0.0
2       1.0
3       1.0
4       1.0
       ... 
2921    1.0
2922    1.0
2923    0.0
2924    0.0
2925    1.0
Name: loan_status_bin, Length: 2926, dtype: float64

In [17]:
# Fresh logistic regression wrapped in a one-step pipeline for cross-validation.
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps)

In [18]:
# Accuracy of the logistic-regression pipeline on the selected features:
# stratified 5-fold CV, one repeat, seeded so the fold assignment is repeatable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so its own .mean() is used; numpy is
# never imported in this notebook, so np.mean(...) raised NameError on a fresh
# kernel.
score = scores.mean()
print(score)

[0.77133106 0.77606838 0.77264957 0.77264957 0.77264957]
0.7730696304075144


In [19]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost is stochastic; seed it (same seed as the CV splitter) so the reported
# accuracy can be reproduced on Restart & Run All. Scores may differ slightly
# from outputs that were recorded with an unseeded model.
ada = AdaBoostClassifier(random_state=1)
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [20]:
# Accuracy of the AdaBoost pipeline on the selected features:
# stratified 5-fold CV, one repeat, seeded so the fold assignment is repeatable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so its own .mean() is used; numpy is
# never imported in this notebook, so np.mean(...) raised NameError on a fresh
# kernel.
score = scores.mean()
print(score)

[0.77474403 0.77606838 0.77094017 0.75897436 0.76923077]
0.7699915405034858


In [21]:
from sklearn.neural_network import MLPClassifier

# MLP weight initialisation and batch shuffling are random; seed the model (same
# seed as the CV splitter) so the reported accuracy can be reproduced. Scores may
# differ slightly from outputs that were recorded with an unseeded model.
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=1)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [22]:
# Accuracy of the MLP pipeline on the selected features:
# stratified 5-fold CV, one repeat, seeded so the fold assignment is repeatable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so its own .mean() is used; numpy is
# never imported in this notebook, so np.mean(...) raised NameError on a fresh
# kernel.
score = scores.mean()
print(score)

[0.77303754 0.77264957 0.77264957 0.77094017 0.77264957]
0.772385286310201


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster (same seed as the CV splitter) so the reported accuracy can be
# reproduced on Restart & Run All. Scores may differ slightly from outputs that
# were recorded with an unseeded model.
gb = GradientBoostingClassifier(random_state=1)
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [24]:
# Accuracy of the gradient-boosting pipeline on the selected features:
# stratified 5-fold CV, one repeat, seeded so the fold assignment is repeatable.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so its own .mean() is used; numpy is
# never imported in this notebook, so np.mean(...) raised NameError on a fresh
# kernel.
score = scores.mean()
print(score)

[0.77645051 0.77264957 0.76923077 0.76410256 0.76923077]
0.7703328374318136
