In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_row", 160)
pd.set_option('max_colwidth', 200)

In [4]:
data = pd.read_csv('../../Data/20x44_DEFG105015.csv')

In [5]:
data.head(5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,inq_last_6mths,delinq_2yrs,mths_since_last_delinq,acc_now_delinq,collections_12_mths_ex_med,pub_rec,months_cr_line,term,initial_list_status,grade,sub_grade,emp_length,ANY,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,educational,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,wedding,Not Verified,Source Verified,Verified,loan_status_bin
0,-0.66416,-0.023755,-0.706543,1.833773,1.833773,-0.567883,-0.652099,-0.226954,-0.731263,-0.51321,1.245929,0.409773,-0.27875,-0.070589,-0.149713,-0.425914,-0.797836,-0.269164,1.417837,-0.061924,0.504395,-0.438425,-0.030504,-0.725596,-0.360968,0.925021,-0.127976,-0.220791,-0.755699,-0.017606,3.40668,-0.153198,-0.19396,-0.208143,-0.227652,-0.571267,-0.074895,-0.250822,-0.162479,-0.088361,-0.589398,1.219856,-0.718147,0.0
1,-0.928384,-0.205881,1.267538,0.096737,0.096737,0.610386,-0.863047,-0.637137,-0.819168,-0.393781,0.468999,-0.442798,-0.863295,-0.070589,-0.149713,-0.425914,0.398094,-0.269164,1.417837,1.297452,1.448344,-1.496217,-0.030504,1.378177,-0.360968,-1.081056,-0.127976,-0.220791,1.323278,-0.017606,-0.293541,-0.153198,-0.19396,-0.208143,-0.227652,-0.571267,-0.074895,-0.250822,-0.162479,-0.088361,-0.589398,1.219856,-0.718147,0.0
2,0.931751,0.071267,-0.910159,1.089329,1.089329,1.784386,1.176933,-0.226954,-0.819168,-0.213828,-1.084861,-0.442798,-0.863295,-0.070589,-0.149713,-0.425914,-1.334784,-0.269164,1.417837,2.656828,2.392294,-0.438425,-0.030504,-0.725596,-0.360968,0.925021,-0.127976,-0.220791,-0.755699,-0.017606,-0.293541,-0.153198,-0.19396,-0.208143,-0.227652,-0.571267,-0.074895,3.98689,-0.162479,-0.088361,-0.589398,1.219856,-0.718147,1.0
3,1.608163,0.229638,0.57723,0.096737,0.096737,-0.777068,1.578133,-0.226954,-0.291743,-0.216527,-1.084861,-0.442798,1.250061,-0.070589,-0.149713,3.486114,-0.358515,-0.269164,1.417837,-0.061924,0.504395,-1.496217,-0.030504,-0.725596,-0.360968,0.925021,-0.127976,-0.220791,1.323278,-0.017606,-0.293541,-0.153198,-0.19396,-0.208143,-0.227652,-0.571267,-0.074895,-0.250822,-0.162479,-0.088361,-0.589398,1.219856,-0.718147,1.0
4,-0.917815,-0.166288,0.070674,-0.647707,-0.647707,-0.777068,-0.912962,0.798502,0.851011,0.024086,0.468999,-0.442798,-0.863295,-0.070589,-0.149713,-0.425914,-0.541565,-0.269164,1.417837,-0.061924,0.504395,-0.967321,-0.030504,1.378177,-0.360968,-1.081056,-0.127976,-0.220791,1.323278,-0.017606,-0.293541,-0.153198,-0.19396,-0.208143,-0.227652,-0.571267,-0.074895,-0.250822,-0.162479,-0.088361,-0.589398,1.219856,-0.718147,1.0


In [6]:
data.shape

(3227, 44)

In [7]:
data.loan_status_bin.value_counts()

1.0    2470
0.0     757
Name: loan_status_bin, dtype: int64

In [8]:
#Null Model (or random investment) will yield, 59.3% of picking a good loan: 
data.loan_status_bin.value_counts()[1] / len(data)

0.7654167957855593

In [9]:
X = data.drop('loan_status_bin', axis=1)
X.shape

(3227, 43)

In [10]:
y = data['loan_status_bin']

In [11]:
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(lr, n_features_to_select=n_features)

In [12]:
sfs.fit(X,y)

In [13]:
#sfs.get_support()

In [14]:
print("Top {} features selected by forward sequential selection:{}"\
      .format(n_features, list(X.columns[sfs.get_support()])))


Top 3 features selected by forward sequential selection:['loan_amnt', 'int_rate', 'total_acc']


In [15]:
feat = list(X.columns[sfs.get_support()])
feat

['loan_amnt', 'int_rate', 'total_acc']

In [16]:
X_feat = data[feat]

In [17]:
y = data['loan_status_bin']
y

0       0.0
1       0.0
2       1.0
3       1.0
4       1.0
       ... 
3222    1.0
3223    0.0
3224    0.0
3225    1.0
3226    1.0
Name: loan_status_bin, Length: 3227, dtype: float64

In [18]:
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps=steps)

In [19]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.76470588 0.76160991 0.76589147 0.76589147 0.76744186]
0.7651081191350468


In [20]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [21]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.76780186 0.74922601 0.75658915 0.7627907  0.77054264]
0.7613900688794489


In [22]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(alpha=1, max_iter=1000)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [23]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.76470588 0.76160991 0.76589147 0.76899225 0.76589147]
0.7654181966544267


In [24]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [25]:
#Accuracy:

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1) 
print(scores)

score = np.mean(scores)
print(score)

[0.76470588 0.73839009 0.75503876 0.75503876 0.76744186]
0.7561230710154319
