In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the loan dataset from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='cleaned_loan.csv')
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,purpose,total_pymnt,issue_month
0,-0.831337,36,-0.368123,-0.73433,4,-1.61181,2,2,1,-0.615834,12
1,-1.596084,60,0.872219,-1.867178,4,-1.195604,1,0,0,-2.080471,12
2,-1.63705,36,1.057465,-1.51184,4,-2.895639,0,2,11,-1.265084,12
3,0.081095,36,0.394338,0.306146,4,-0.290022,1,2,9,0.267863,12
4,-1.407828,60,0.179561,-1.741542,4,0.577374,1,1,9,-1.124963,12


In [3]:
# Target vector: the loan_status column.
y = df['loan_status']
# Feature matrix: every remaining column.
X = df.drop('loan_status', axis=1)

In [4]:
# Train-test split: hold out 20% of the rows for the final evaluation.
from sklearn.model_selection import train_test_split

# The seed is an explicit integer. A boolean `True` is accepted by scikit-learn
# but is simply coerced to 1, so it reads like an on/off flag while acting as a
# seed; writing 1 keeps the exact same partition and states the intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [5]:
# Hyper-parameter tuning: exhaustive 5-fold grid search over the number of
# boosting stages and the learning rate, scored by accuracy.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Explicit integer seed; a boolean `True` would be coerced to 1 and is misleading.
estimator = GradientBoostingClassifier(random_state=1)
param_grid = {'n_estimators': list(range(1, 20)),
              'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

# 19 x 10 = 190 candidates, each fitted on 5 folds. The fits are independent and
# the estimator is seeded, so running them on all cores does not change the result.
gb_grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
gb_grid.fit(X_train, y_train)

# NOTE(review): the recorded best n_estimators (19) is the largest value in the
# grid, so the optimum may lie beyond it - consider widening the range.
gb_grid.best_params_

{'learning_rate': 0.7, 'n_estimators': 19}

In [9]:
# Importance score of each input column according to the best model found by
# the grid search.
best_gb = gb_grid.best_estimator_
best_gb.feature_importances_

array([0.02885872, 0.10396397, 0.00833148, 0.27472059, 0.        ,
       0.00188925, 0.        , 0.00059926, 0.57322641, 0.00841031])

In [10]:
# Tabulate the importance scores against the training column names.
feature = pd.DataFrame(
    {"Feature_importances": gb_grid.best_estimator_.feature_importances_},
    index=X_train.columns,
)

# Keep only the columns whose importance is strictly positive.
uses_feature = feature["Feature_importances"].gt(0)
feature_imp = feature.loc[uses_feature]

# Names of the retained columns, as a plain list for column selection.
imp_features = list(feature_imp.index)
imp_features

['loan_amnt',
 'term',
 'int_rate',
 'installment',
 'annual_inc',
 'purpose',
 'total_pymnt',
 'issue_month']

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Restrict the feature matrix to the columns with non-zero importance.
X3 = X[imp_features]

# Same test_size and integer seed as the first split (a boolean `True` seed is
# coerced to 1), so the rows should fall into the same train/test partitions
# that were used to compute the feature importances.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.2, random_state=1)

# Take the tuned hyper-parameters straight from the grid search instead of
# retyping them, and seed the model like the estimator that was tuned so the
# scores below are reproducible across runs.
gb_model = GradientBoostingClassifier(random_state=1, **gb_grid.best_params_)
gb_model.fit(X3_train, y3_train)

y3_pred_train = gb_model.predict(X3_train)
y3_pred_test = gb_model.predict(X3_test)

# accuracy_score is documented as (y_true, y_pred); accuracy happens to be
# symmetric, but the documented order keeps the pattern safe for other metrics.
print('train_accuracy', accuracy_score(y3_train, y3_pred_train))
print('test_accuracy', accuracy_score(y3_test, y3_pred_test))

# cross_val_score fits clones of gb_model on 5 folds of the training data only.
cv_scores = cross_val_score(gb_model, X3_train, y3_train, scoring='accuracy', cv=5)
print('cross_validation_score', cv_scores.mean())

train_accuracy 0.9455197809460864
test_accuracy 0.9373111782477341
cross_validation_score 0.939319642780637
