In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
# Raise pandas' display limits so wide frames and long cell values are shown
# without truncation. Options are spelled with their full registered names:
# pandas resolves abbreviated names by pattern matching, which can stop
# working when a future release adds an option with a similar name.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_colwidth", 200)

In [3]:
# Load the modelling dataset from a CSV file; the path is relative to the
# directory the notebook is run from.
data = pd.read_csv('../../Data/20x43_DEFG86017_5.csv')

In [4]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,mths_since_last_delinq,acc_now_delinq,collections_12_mths_ex_med,pub_rec,months_cr_line,term,initial_list_status,grade,sub_grade,emp_length,ANY,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,-0.39478,-0.092019,-0.873321,1.940594,1.940594,-0.58395,-0.393509,-0.321346,-0.841857,-0.819426,1.187098,0.393653,-0.31313,-0.069643,-0.151296,-0.426999,-0.884191,-0.239876,1.395799,-0.043589,0.530201,-0.529283,-0.029637,-0.791641,-0.356514,0.997808,-0.13366,-0.217565,-0.712232,3.435603,-0.148233,-0.194412,-0.235757,-0.226273,-0.590356,-0.069643,-0.243941,-0.193193,-0.089225,-0.627602,1.248119,-0.696197,0.0
1,-0.742344,-0.248924,0.844724,0.113283,0.113283,0.615513,-0.670219,-0.723183,-0.928005,-0.652416,0.423724,-0.46321,-0.900363,-0.069643,-0.151296,-0.426999,0.311603,-0.239876,1.395799,1.317237,1.486158,-1.592515,-0.029637,1.263199,-0.356514,-1.002196,-0.13366,-0.217565,1.404036,-0.29107,-0.148233,-0.194412,-0.235757,-0.226273,-0.590356,-0.069643,-0.243941,-0.193193,-0.089225,-0.627602,1.248119,-0.696197,0.0
2,0.689619,-0.078375,1.560036,-0.66985,-0.66985,-0.58395,0.680086,0.281409,0.967242,-0.116192,2.713845,2.964244,-0.900363,-0.069643,-0.151296,0.873978,-0.359506,-0.239876,1.395799,-0.043589,0.530201,-0.529283,-0.029637,1.263199,-0.356514,-1.002196,-0.13366,-0.217565,-0.712232,-0.29107,-0.148233,-0.194412,-0.235757,-0.226273,-0.590356,-0.069643,4.099352,-0.193193,-0.089225,-0.627602,1.248119,-0.696197,1.0
3,-0.241852,-0.221636,1.050025,-0.930894,-0.930894,1.406463,-0.098216,-0.321346,-0.411119,-0.611183,2.713845,0.393653,0.319274,-0.069643,-0.151296,0.873978,-0.566939,-0.239876,1.395799,1.317237,2.123462,0.002333,-0.029637,1.263199,-0.356514,-1.002196,-0.13366,-0.217565,1.404036,-0.29107,-0.148233,-0.194412,-0.235757,-0.226273,-0.590356,-0.069643,-0.243941,-0.193193,-0.089225,-0.627602,1.248119,-0.696197,1.0
4,0.773034,-0.207992,1.454144,-0.147761,-0.147761,-1.009846,0.719284,0.281409,-0.066529,0.231414,-0.339649,-0.46321,-0.900363,-0.069643,-0.151296,0.873978,1.165741,-0.239876,1.395799,-0.043589,0.211549,1.065565,-0.029637,1.263199,-0.356514,-1.002196,-0.13366,-0.217565,-0.712232,3.435603,-0.148233,-0.194412,-0.235757,-0.226273,-0.590356,-0.069643,-0.243941,-0.193193,-0.089225,-0.627602,-0.801206,1.436375,1.0


In [5]:
# (rows, columns) of the loaded frame, target column included.
data.shape

(2279, 43)

In [6]:
# Class balance of the binary target.
data['loan_status_bin'].value_counts()

1.0    1786
0.0     493
Name: loan_status_bin, dtype: int64

In [7]:
# Null model (random investment) baseline: the share of rows with
# loan_status_bin == 1, i.e. the chance of picking a good loan blindly
# (about 78.4% for this file, not 59.3% as an earlier comment stated).
# Comparing against the label avoids the positional-vs-label ambiguity of
# value_counts()[1] and yields 0.0 instead of a KeyError if the class is absent.
(data['loan_status_bin'] == 1).mean()

0.7836770513383062

In [8]:
# Feature matrix: every column except the target.
X = data.drop(columns='loan_status_bin')
X.shape

(2279, 42)

In [9]:
# Target vector: the binary loan status.
y = data.loan_status_bin

In [10]:
# Sequential feature selector wrapped around a default logistic regression;
# it is asked to keep `n_features` columns.
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=lr,
    n_features_to_select=n_features,
)

In [11]:
# Run the selection on the full feature matrix.
# NOTE(review): every row is used here, and the cross-validation cells later
# score models on these same rows, so those CV scores are computed on
# features chosen with the validation folds in view — confirm this is intended.
sfs.fit(X, y)

In [12]:
# Debug aid: uncomment to inspect the boolean mask of the selected columns.
#sfs.get_support()

In [13]:
# Report the names of the columns kept by the fitted selector.
print(
    f"Top {n_features} features selected by forward sequential selection:"
    f"{list(X.columns[sfs.get_support()])}"
)


Top 3 features selected by forward sequential selection:['loan_amnt', 'int_rate', 'revol_bal']


In [14]:
# Names of the selected columns as a plain Python list.
feat = X.columns[sfs.get_support()].tolist()
feat

['loan_amnt', 'int_rate', 'revol_bal']

In [15]:
# Reduced feature matrix holding only the selected columns.
X_feat = data.loc[:, feat]

In [16]:
# Re-bind the target vector (same column as before) and display it.
y = data.loc[:, 'loan_status_bin']
y

0       0.0
1       0.0
2       1.0
3       1.0
4       1.0
       ... 
2274    1.0
2275    1.0
2276    0.0
2277    0.0
2278    1.0
Name: loan_status_bin, Length: 2279, dtype: float64

In [17]:
# Single-step pipeline around a fresh default logistic regression, used to
# score the selected features.
lr_check = LogisticRegression()
steps = [("model", lr_check)]
pipeline = Pipeline(steps)

In [18]:
# Accuracy of the logistic-regression pipeline on the selected features,
# estimated with stratified 5-fold cross-validation (one repeat, fixed seed).

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so use its own mean(); the previous
# np.mean call relied on a name `np` that no visible cell imports, which
# fails on a fresh kernel.
score = scores.mean()
print(score)

[0.78508772 0.78289474 0.78289474 0.7872807  0.78461538]
0.7845546558704454


In [19]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default hyper-parameters. random_state is pinned (same seed
# as the CV splitter) so re-running the notebook fits the same ensemble.
ada = AdaBoostClassifier(random_state=1)
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [20]:
# Accuracy of the AdaBoost pipeline on the selected features, estimated with
# stratified 5-fold cross-validation (one repeat, fixed seed).

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so use its own mean(); the previous
# np.mean call relied on a name `np` that no visible cell imports, which
# fails on a fresh kernel.
score = scores.mean()
print(score)

[0.77850877 0.78289474 0.78070175 0.7872807  0.78241758]
0.7823607094659726


In [21]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with strong L2 regularisation (alpha=1) and a raised
# iteration cap. Weight initialisation and batch shuffling are random, so
# random_state is pinned to make the reported scores reproducible.
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=1)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [22]:
# Accuracy of the MLP pipeline on the selected features, estimated with
# stratified 5-fold cross-validation (one repeat, fixed seed).

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so use its own mean(); the previous
# np.mean call relied on a name `np` that no visible cell imports, which
# fails on a fresh kernel.
score = scores.mean()
print(score)

[0.78289474 0.78289474 0.78289474 0.78508772 0.78461538]
0.7836774628879892


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters. random_state is pinned
# (same seed as the CV splitter) so re-running the notebook fits the same model.
gb = GradientBoostingClassifier(random_state=1)
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [24]:
# Accuracy of the gradient-boosting pipeline on the selected features,
# estimated with stratified 5-fold cross-validation (one repeat, fixed seed).

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# cross_val_score returns a NumPy array, so use its own mean(); the previous
# np.mean call relied on a name `np` that no visible cell imports, which
# fails on a fresh kernel.
score = scores.mean()
print(score)

[0.77412281 0.76754386 0.77850877 0.77412281 0.77802198]
0.7744640447272026
