In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries imported above; consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

In [2]:
# Read cleaned_loan.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='cleaned_loan.csv')
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,purpose,total_pymnt,issue_month
0,-0.831337,36,-0.368123,-0.73433,4,-1.61181,2,2,1,-0.615834,12
1,-1.596084,60,0.872219,-1.867178,4,-1.195604,1,0,0,-2.080471,12
2,-1.63705,36,1.057465,-1.51184,4,-2.895639,0,2,11,-1.265084,12
3,0.081095,36,0.394338,0.306146,4,-0.290022,1,2,9,0.267863,12
4,-1.407828,60,0.179561,-1.741542,4,0.577374,1,1,9,-1.124963,12


In [3]:
# Split the frame into the target (loan_status) and the remaining predictor columns.
y = df['loan_status']
X = df.drop('loan_status', axis=1)

In [4]:
# Hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# random_state is meant to be an integer seed. The original passed True, which
# only worked because bool is an int subclass (True == 1); spelling the seed
# out as 1 keeps the intent explicit and should reproduce the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [7]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Baseline: a decision tree with default hyper-parameters (no depth limit).
# The tree is seeded so that the scores printed below, and the depth read from
# this model afterwards, are reproducible on a fresh Restart & Run All.
# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run.
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X_train, y_train)

# Predictions on the data the tree was fitted on and on the held-out rows.
ypred_train = dt_model.predict(X_train)
ypred_test = dt_model.predict(X_test)

# Train accuracy, test accuracy, and mean 5-fold CV accuracy on the training set.
print('train_accuracy:', accuracy_score(y_train, ypred_train))
print('test_accuracy:', accuracy_score(y_test, ypred_test))
print('cv_score:', cross_val_score(dt_model, X_train, y_train, cv=5, scoring='accuracy').mean())

train_accuracy: 1.0
test_accuracy: 0.9142749244712991
cv_score: 0.9156201672772462


In [13]:
# Depth of the fitted tree currently held in dt_model.
print('max depth of tree:', dt_model.get_depth())

max depth of tree: 35


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Tune the split criterion and the maximum depth using 5-fold cross-validated
# accuracy on the training set.
# The seed is written as an explicit integer; the original passed True, which
# scikit-learn treated as the integer 1.
estimator = DecisionTreeClassifier(random_state=1)
param_grid = {
    'criterion': ['gini', 'entropy'],
    # Candidate depths 1..35.
    'max_depth': list(range(1, 36)),
}
# 70 parameter combinations x 5 folds are independent fits; n_jobs=-1 runs them
# on all available cores without changing which parameters are selected.
dt_grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid.fit(X_train, y_train)
dt_grid.best_params_

{'criterion': 'entropy', 'max_depth': 9}

In [15]:
# Feature importance scores of the estimator that the grid search selected as best.
dt_grid.best_estimator_.feature_importances_

array([2.40504757e-02, 1.34821766e-01, 1.41097622e-02, 4.22961106e-01,
       2.90417644e-04, 2.52664961e-03, 5.51441097e-04, 1.44575023e-03,
       3.77753719e-01, 2.14889134e-02])

In [16]:
# Label each importance score of the tuned tree with its training column name.
feature = pd.DataFrame(
    {'Feature_importance': dt_grid.best_estimator_.feature_importances_},
    index=X_train.columns,
)

# Keep the rows whose importance is strictly positive and collect their names.
feature_imp = feature.loc[feature['Feature_importance'].gt(0)]
imp_feature = list(feature_imp.index)
imp_feature

['loan_amnt',
 'term',
 'int_rate',
 'installment',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'purpose',
 'total_pymnt',
 'issue_month']

In [18]:
# Restrict the predictors to the features that had non-zero importance.
X1 = X[imp_feature]

# Same test_size and seed as the first split. The seed is written as an explicit
# integer; the original passed True, which scikit-learn treated as 1.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.2, random_state=1)

# Refit with the parameters chosen by the grid search. Reading them from
# dt_grid.best_params_ (criterion and max_depth) instead of hard-coding copies
# keeps this cell correct if the grid or the data changes.
dt_model = DecisionTreeClassifier(**dt_grid.best_params_, random_state=1)
dt_model.fit(X1_train, y1_train)

y_pred_train = dt_model.predict(X1_train)
y_pred_test = dt_model.predict(X1_test)

# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the values
# are unchanged, but the conventional order avoids mistakes if another metric
# is swapped in later.
print('train_accuracy', accuracy_score(y1_train, y_pred_train))
print('test_accuracy', accuracy_score(y1_test, y_pred_test))
print('cv_score', cross_val_score(dt_model, X1_train, y1_train, cv=5, scoring='accuracy').mean())

train_accuracy 0.9479117489692506
test_accuracy 0.9369335347432024
cv_score 0.9395084506634154
