In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
# Widen pandas' display limits so wide frames and long cell values are shown in full.
# Options are addressed by their full names: abbreviated patterns such as
# "display.max_row" or "max_colwidth" are resolved by pattern matching and can
# become ambiguous if a similarly named option exists.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 160)
pd.set_option("display.max_colwidth", 200)

In [3]:
# Load the loan dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../../Data/20x42_D86015.csv')

In [4]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,loan_amnt,annual_inc,dti,fico_range_low,fico_range_high,int_rate,installment,open_acc,total_acc,revol_bal,revol_util,inq_last_6mths,delinq_2yrs,acc_now_delinq,collections_12_mths_ex_med,tot_coll_amt,tot_cur_bal,pub_rec,months_cr_line,term,initial_list_status,sub_grade,emp_length,MORTGAGE,OWN,RENT,car,credit_card,debt_consolidation,home_improvement,house,major_purchase,medical,moving,other,renewable_energy,small_business,vacation,Not Verified,Source Verified,Verified,loan_status_bin
0,-1.681284,-0.164637,0.846384,-0.431787,-0.431787,-0.632827,-1.683588,-0.515811,0.052865,0.087789,0.27832,-0.948659,-0.486876,-0.049447,-0.149813,-0.100298,-0.612943,-0.371035,-0.0975,-0.070014,0.915732,-1.427248,0.003312,-0.74366,-0.372678,0.95702,-0.188025,-0.300708,-0.841625,-0.243786,-0.121867,-0.207983,-0.201517,-0.201517,1.84624,-0.049447,-0.149813,-0.166039,1.25643,-0.755567,-0.575473,0.0
1,0.239648,-0.034893,-0.000947,-0.727333,-0.727333,1.345915,0.28095,0.317617,0.145098,-0.41438,-0.692823,-0.08815,-0.486876,-0.049447,-0.149813,-0.100298,-0.609911,3.369727,0.583889,-0.070014,0.915732,0.700649,-1.626161,-0.74366,-0.372678,0.95702,-0.188025,-0.300708,-0.841625,4.101961,-0.121867,-0.207983,-0.201517,-0.201517,-0.541641,-0.049447,-0.149813,-0.166039,-0.795906,1.32351,-0.575473,1.0
2,-0.858027,0.195402,0.643024,-0.431787,-0.431787,1.345915,-0.832317,-0.724168,-1.053928,1.56291,1.729659,-0.948659,-0.486876,-0.049447,-0.149813,-0.100298,-0.393788,-0.371035,2.905658,-0.070014,0.915732,0.700649,-0.268267,-0.74366,2.683282,-1.04491,-0.188025,-0.300708,-0.841625,-0.243786,-0.121867,-0.207983,-0.201517,-0.201517,1.84624,-0.049447,-0.149813,-0.166039,-0.795906,-0.755567,1.737702,0.0
3,0.239648,-0.002458,0.695168,0.159307,0.159307,1.345915,0.28095,1.567759,0.790727,2.745707,0.511251,-0.948659,-0.486876,-0.049447,-0.149813,-0.100298,1.183375,-0.371035,-0.69056,-0.070014,-1.092023,0.700649,-0.268267,1.3447,-0.372678,-1.04491,-0.188025,-0.300708,-0.841625,-0.243786,-0.121867,-0.207983,4.962358,-0.201517,-0.541641,-0.049447,-0.149813,-0.166039,-0.795906,1.32351,-0.575473,0.0
4,0.12988,-0.185915,1.112315,-0.431787,-0.431787,1.345915,0.169667,1.567759,2.081985,-0.384941,-0.660571,0.77236,-0.486876,-0.049447,-0.149813,0.093239,0.14297,-0.371035,-0.0975,-0.070014,0.915732,0.700649,-1.626161,-0.74366,2.683282,-1.04491,-0.188025,-0.300708,1.188177,-0.243786,-0.121867,-0.207983,-0.201517,-0.201517,-0.541641,-0.049447,-0.149813,-0.166039,-0.795906,-0.755567,1.737702,1.0


In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(410, 42)

In [6]:
# Class balance of the binary target column.
data['loan_status_bin'].value_counts()

1.0    359
0.0     51
Name: loan_status_bin, dtype: int64

In [7]:
# Null model: always predicting the majority class (1.0, treated here as a good loan)
# is correct for the share of rows with loan_status_bin == 1, which is about 87.6%
# on this sample (359 of 410 rows).
# A boolean mean is used instead of value_counts()[1] so the result does not depend
# on how an integer key is resolved against the float-valued class labels.
(data['loan_status_bin'] == 1).mean()

0.875609756097561

In [8]:
# Feature matrix: every column except the target.
X = data.drop('loan_status_bin', axis=1)
# 'Unnamed: 0' (visible in the preview of `data`) looks like a row index that was
# written out with the CSV rather than a predictor; drop it if present so the
# feature selector cannot pick it up. errors='ignore' keeps this a no-op when the
# column is absent.
X = X.drop(columns='Unnamed: 0', errors='ignore')
X.shape

(410, 41)

In [9]:
# Target vector: the binary loan-status column.
y = data.loc[:, 'loan_status_bin']

In [10]:
# Sequential selection of a small feature subset, scored with a
# default-configured logistic regression.
n_features = 3
lr = LogisticRegression()
sfs = SequentialFeatureSelector(
    estimator=lr,
    n_features_to_select=n_features,
    direction="forward",  # the library default, spelled out to match the message printed later
)

In [11]:
# Fit the selector on the full feature matrix and target.
# NOTE(review): the later cross-validation cells score models on these same rows,
# so the feature choice has already seen every validation fold and the reported
# accuracies may be optimistic — consider running the selection inside the CV pipeline.
sfs.fit(X,y)

In [12]:
#sfs.get_support()

In [13]:
# Report which columns the fitted selector kept.
selected = list(X.columns[sfs.get_support()])
template = "Top {} features selected by forward sequential selection:{}"
print(template.format(n_features, selected))


Top 3 features selected by forward sequential selection:['loan_amnt', 'dti', 'fico_range_low']


In [14]:
# Names of the columns retained by the fitted selector, as a plain list.
selected_mask = sfs.get_support()
feat = X.columns[selected_mask].tolist()
feat

['loan_amnt', 'dti', 'fico_range_low']

In [15]:
# Restrict the data to the selected feature columns.
X_feat = data.loc[:, feat]

In [16]:
# Re-extract the target column and display it for a quick visual check.
y = data.loc[:, 'loan_status_bin']
y

0      0.0
1      1.0
2      0.0
3      0.0
4      1.0
      ... 
405    1.0
406    1.0
407    1.0
408    1.0
409    0.0
Name: loan_status_bin, Length: 410, dtype: float64

In [17]:
# Baseline classifier: logistic regression as the only step of a pipeline.
lr_check = LogisticRegression()
steps = [('model', lr_check)]
pipeline = Pipeline(steps)

In [18]:
# Accuracy of the logistic-regression pipeline under stratified 5-fold CV.
# With n_repeats=1 the splitter performs a single pass of 5 folds; random_state is fixed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# Use the array's own .mean(): `np` is never imported in this notebook, so
# np.mean(scores) raises NameError on a fresh kernel.
# NOTE(review): the recorded mean (0.8756) equals the share of class 1.0 computed
# earlier, i.e. what always predicting the majority class would score — consider
# an imbalance-aware metric alongside accuracy.
score = scores.mean()
print(score)

[0.86585366 0.87804878 0.87804878 0.87804878 0.87804878]
0.875609756097561


In [19]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost as the only pipeline step. The estimator is seeded (same value as the
# CV splitter's random_state) so re-running the notebook reproduces its scores.
ada = AdaBoostClassifier(random_state=1)
steps2 = [('model', ada)]
pipeline2 = Pipeline(steps=steps2)

In [20]:
# Accuracy of the AdaBoost pipeline under stratified 5-fold CV.
# With n_repeats=1 the splitter performs a single pass of 5 folds; random_state is fixed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline2, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# Use the array's own .mean(): `np` is never imported in this notebook, so
# np.mean(scores) raises NameError on a fresh kernel.
score = scores.mean()
print(score)

[0.84146341 0.87804878 0.82926829 0.85365854 0.87804878]
0.8560975609756097


In [21]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron as the only pipeline step. The estimator is seeded (same
# value as the CV splitter's random_state) so re-running the notebook reproduces
# its scores instead of depending on a fresh random initialisation.
nn = MLPClassifier(alpha=1, max_iter=1000, random_state=1)
steps3 = [('model', nn)]
pipeline3 = Pipeline(steps=steps3)

In [22]:
# Accuracy of the MLP pipeline under stratified 5-fold CV.
# With n_repeats=1 the splitter performs a single pass of 5 folds; random_state is fixed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline3, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# Use the array's own .mean(): `np` is never imported in this notebook, so
# np.mean(scores) raises NameError on a fresh kernel.
score = scores.mean()
print(score)

[0.86585366 0.87804878 0.87804878 0.87804878 0.87804878]
0.875609756097561


In [23]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting as the only pipeline step. The estimator is seeded (same value
# as the CV splitter's random_state) so re-running the notebook reproduces its scores.
gb = GradientBoostingClassifier(random_state=1)
steps4 = [('model', gb)]
pipeline4 = Pipeline(steps=steps4)

In [24]:
# Accuracy of the gradient-boosting pipeline under stratified 5-fold CV.
# With n_repeats=1 the splitter performs a single pass of 5 folds; random_state is fixed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(pipeline4, X_feat, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(scores)

# Use the array's own .mean(): `np` is never imported in this notebook, so
# np.mean(scores) raises NameError on a fresh kernel.
score = scores.mean()
print(score)

[0.81707317 0.84146341 0.87804878 0.87804878 0.86585366]
0.8560975609756097
