In [1]:
# Data-handling and plotting libraries for the notebook.
# NOTE(review): numpy, seaborn and matplotlib are not referenced by the cells
# visible in this section — confirm they are needed elsewhere before keeping them.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): the 'ignore' action installs a catch-all filter, so every
# warning (including deprecation warnings raised by the libraries used below)
# is silenced for the rest of the session. Consider narrowing it to a category.
warnings.simplefilter('ignore')

In [2]:
# Read the loan table from disk and preview its first rows.
csv_path = 'cleaned_loan.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,purpose,total_pymnt,issue_month
0,-0.831337,36,-0.368123,-0.73433,4,-1.61181,2,2,1,-0.615834,12
1,-1.596084,60,0.872219,-1.867178,4,-1.195604,1,0,0,-2.080471,12
2,-1.63705,36,1.057465,-1.51184,4,-2.895639,0,2,11,-1.265084,12
3,0.081095,36,0.394338,0.306146,4,-0.290022,1,2,9,0.267863,12
4,-1.407828,60,0.179561,-1.741542,4,0.577374,1,1,9,-1.124963,12


In [3]:
# Separate the label column (y) from the remaining predictor columns (X).
target_col = 'loan_status'
y = df[target_col]
X = df.drop(columns=[target_col])

In [4]:
# Train-test split: hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split

# The seed is written as the integer 1 instead of the boolean True. Both
# select the same shuffle (bool is an int subclass, True == 1), but an
# explicit integer makes it clear that a fixed seed is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [10]:
# Hyper-parameter tuning: exhaustive grid search over XGBoost settings,
# scored by accuracy with 5-fold cross-validation on the training split.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

estimator = XGBClassifier()

# 4 x 3 x 5 = 60 candidate combinations, each fitted once per fold.
param_grid = {
    'n_estimators': [100, 110, 120, 150],
    'max_depth': [3, 4, 5],
    'gamma': [0, 0.15, 0.3, 0.5, 1],
}

xgb_grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
xgb_grid.fit(X_train, y_train)

# Best-scoring combination from the grid.
xgb_grid.best_params_

{'gamma': 0, 'max_depth': 4, 'n_estimators': 120}

In [11]:
# Importance score of each input column according to the best model found
# by the grid search (one value per column of X_train, in column order).
tuned_model = xgb_grid.best_estimator_
tuned_model.feature_importances_

array([0.14603339, 0.35555953, 0.03324141, 0.23312919, 0.01362222,
       0.01527295, 0.01058959, 0.01530295, 0.13958338, 0.03766532],
      dtype=float32)

In [12]:
# Tabulate the tuned model's importance score for each training column,
# keep the columns whose score is strictly positive, and list their names.
importance_scores = xgb_grid.best_estimator_.feature_importances_
feature = pd.DataFrame(
    {"Feature_importances": importance_scores},
    index=X_train.columns,
)

has_importance = feature["Feature_importances"].gt(0)
feature_imp = feature.loc[has_importance]

imp_features = list(feature_imp.index)
imp_features

['loan_amnt',
 'term',
 'int_rate',
 'installment',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'purpose',
 'total_pymnt',
 'issue_month']

In [14]:
# Refit XGBoost on the selected features with the tuned hyper-parameters and
# report train, test and cross-validated accuracy.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Restrict the predictors to the columns kept by the importance filter.
X4 = X[imp_features]

# Explicit integer seed: 1 is the value the boolean True evaluates to, so the
# rows fall into the same partitions as with random_state=True.
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y, test_size=0.2, random_state=1
)

# Hyper-parameters are the best combination reported by the grid search.
xgb_model = XGBClassifier(n_estimators=120, max_depth=4, gamma=0)
xgb_model.fit(X4_train, y4_train)

y4_pred_train = xgb_model.predict(X4_train)
y4_pred_test = xgb_model.predict(X4_test)

# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, so the printed values are unchanged, but keeping the documented
# order prevents silent errors if the metric is later swapped for an
# asymmetric one such as precision or recall.
print('train_accuracy', accuracy_score(y4_train, y4_pred_train))
print('test_accuracy', accuracy_score(y4_test, y4_pred_test))

# cross_val_score fits clones of the estimator on each of the 5 folds of the
# training split; the fitted xgb_model above is not modified by it.
cv_scores = cross_val_score(xgb_model, X4_train, y4_train, scoring='accuracy', cv=5)
print('cross_validation_score', cv_scores.mean())

train_accuracy 0.9570704686368929
test_accuracy 0.9446122860020141
cross_validation_score 0.9459919360410819


In [15]:
# Persist the fitted model to disk so it can be reloaded with joblib.load.
import joblib

# The file name previously contained a comma ('XGBoost,joblib'), which wrote a
# file with no extension; use the intended '.joblib' suffix instead.
# NOTE(review): any code that loads the model must use this same file name.
joblib.dump(xgb_model, 'XGBoost.joblib')

['XGBoost,joblib']