In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the loan dataset from a path relative to the notebook's working directory.
# NOTE(review): the file name suggests rows with missing values were already dropped — confirm upstream.
data = pd.read_csv('../../../Data/Combo_4_DropNAs.csv')

In [3]:
data

Unnamed: 0,Risk_Score,DI,Loan Type,Employment Length,Accepted
0,677.0,0.065455,Debt Consolidation,10+ years,1
1,717.0,0.380000,Business Loan,10+ years,1
2,697.0,0.317460,Home Improvement,10+ years,1
3,787.0,0.318182,Debt Consolidation,10+ years,1
4,697.0,0.099585,Major Purchase,3 years,1
...,...,...,...,...,...
11105947,681.0,0.551500,Debt Consolidation,< 1 year,0
11105948,531.0,0.313100,Debt Consolidation,< 1 year,0
11105949,590.0,0.412600,Debt Consolidation,< 1 year,0
11105950,686.0,0.102600,Other,< 1 year,0


In [4]:
data.dtypes

Risk_Score           float64
DI                   float64
Loan Type             object
Employment Length     object
Accepted               int64
dtype: object

In [5]:
# Names of the string-valued columns; passed to CatBoost via `cat_features` when fitting.
cat_features = [
    'Loan Type',
    'Employment Length',
]
print(cat_features)

['Loan Type', 'Employment Length']


In [6]:
# Separate the binary target column from the predictor columns and show both shapes.
y = data['Accepted']
X = data.drop(columns='Accepted')
print(y.shape)
print(X.shape)

(11105952,)
(11105952, 4)


In [7]:
# Class balance of the target. Use pandas' vectorized reductions rather than the
# Python built-ins set()/sum(), which iterate the Series element by element
# (the original also summed the full Series three separate times).
n_total = len(y)
n_accepted = int(y.sum())
n_rejected = n_total - n_accepted

print('Labels: {}'.format(set(y.unique().tolist())))
print('Zero count or Rejected = {}, One count or Accepted = {}'.format(n_rejected, n_accepted))
print('Ratio of One Count or Accepted = {}'.format(n_accepted / n_total))

Labels: {0, 1}
Zero count or Rejected = 8992195, One count or Accepted = 2113757
Ratio of One Count or Accepted = 0.1903265024015951


In [8]:
from catboost import CatBoostClassifier
# Baseline model: 100 boosting iterations; no other hyper-parameter is set explicitly.
cb = CatBoostClassifier(iterations=100)
# Fit on the full dataset (no hold-out), so scores computed from `cb` on X, y are in-sample.
cb.fit(X, y, cat_features = cat_features, verbose = False)

<catboost.core.CatBoostClassifier at 0x7f8188035c70>

In [9]:
cb.predict(X)

array([1, 1, 1, ..., 0, 0, 0])

In [10]:
y

0           1
1           1
2           1
3           1
4           1
           ..
11105947    0
11105948    0
11105949    0
11105950    0
11105951    0
Name: Accepted, Length: 11105952, dtype: int64

In [11]:
cb.score(X,y)

0.9755720175992116

In [12]:
from sklearn.metrics import confusion_matrix
# In-sample confusion matrix for the baseline model: true labels first, predictions second.
confusion_matrix_f = confusion_matrix(y, cb.predict(X))
confusion_matrix_f

array([[8883738,  108457],
       [ 162839, 1950918]])

In [13]:
# Wrap the raw counts in a labelled frame: one row per actual class, one column per predicted class.
confusion_matrix_full = pd.DataFrame(
    data=confusion_matrix_f,
    columns=["Predicted_Rejected", "Predicted_Accepted"],
    index=["Actual_Rejected", "Actual_Accepted"],
)

confusion_matrix_full

Unnamed: 0,Predicted_Rejected,Predicted_Accepted
Actual_Rejected,8883738,108457
Actual_Accepted,162839,1950918


In [14]:
# Actual number of rejected loans, counted from the labels themselves rather than
# hand-copied from the confusion-matrix output (which goes stale once the model or data change).
# The matrix row sum should match this independently derived value.
Total_Rejected = int((y == 0).sum())
Total_Rejected

8992195

In [15]:
#Cross Check:
confusion_matrix_full.iloc[0,:].sum()

8992195

In [16]:
# Actual number of accepted loans, counted from the labels themselves rather than
# hand-copied from the confusion-matrix output (which goes stale once the model or data change).
# The matrix row sum should match this independently derived value.
Total_Accepted = int((y == 1).sum())
Total_Accepted

2113757

In [17]:
#Cross Check:
confusion_matrix_full.iloc[1,:].sum()

2113757

In [18]:
data.Accepted.value_counts()

0    8992195
1    2113757
Name: Accepted, dtype: int64

In [19]:
#Accuracy Cross Check: correct predictions (matrix diagonal) divided by all predictions.
# Reduce the underlying array directly: np.sum(DataFrame) dispatches to DataFrame.sum(axis=None),
# whose column-wise behaviour is deprecated in recent pandas releases.
_cm_counts = confusion_matrix_full.to_numpy()
np.trace(_cm_counts) / _cm_counts.sum()

0.9755720175992116

In [20]:
cb.score(X,y)

0.9755720175992116

In [21]:
#Sensitivity / Recall == (TP / (TP+FN))
# Unpack the 2x2 matrix row by row: first row is (TN, FP), second row is (FN, TP).
(TN, FP), (FN, TP) = confusion_matrix_f
print(TN, FP, FN, TP, sep='\n')

8883738
108457
162839
1950918


In [22]:
confusion_matrix_full

Unnamed: 0,Predicted_Rejected,Predicted_Accepted
Actual_Rejected,8883738,108457
Actual_Accepted,162839,1950918


In [23]:
# Recall (sensitivity) for the positive class, "Accepted":
#   TP = accepted loans that the model predicted as accepted
#   FN = accepted loans that the model predicted as rejected

Rec = TP / (FN + TP)
Rec

0.9229622894211587

In [24]:
#Cross Check "Recall" (or Sensitivity) with Catboost built-in eval metrics:
from catboost.utils import eval_metric

# eval_metric returns a list (a single value here), hence the bracketed output.
Recall = eval_metric(y, cb.predict(X), 'Recall')
print(Recall)

[0.9229622894211587]


In [25]:
#Specificity, not native to Catboost(?)

In [26]:
# Specificity (true-negative rate): share of actually rejected loans predicted as rejected.
Spec = TN / (FP+TN)
Spec

0.9879387624489905

In [27]:
#Precision

In [28]:
# Precision: share of loans predicted as accepted that were actually accepted.
Prec = TP / (TP + FP)
Prec

0.9473349924127465

In [29]:
# Cross-check the hand-computed precision against CatBoost's eval_metric (returns a one-element list).
Precision = eval_metric(y, cb.predict(X), 'Precision')
Precision

[0.9473349924127465]

In [30]:
#F Measure

In [31]:
# F1 score: harmonic mean of the precision and recall computed above.
F = 2 * Prec * Rec / (Prec + Rec)
F

0.9349898349728694

In [54]:
# NOTE(review): this cell's execution count (54) is later than that of the cell which refits `cb`
# on the oversampled data (36), so the printed value was presumably produced by the refit model;
# that would explain why it disagrees with the hand-computed F. Re-run top to bottom to confirm.
F1 = eval_metric(y, cb.predict(X), 'F1')
print(F1)

[0.9005257439695977]


In [33]:
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate minority-class rows so that both classes end up with the same count.
# random_state pins which rows get duplicated, making the resampled set (and every metric
# computed from it) reproducible across kernel restarts.
# NOTE(review): resampling is applied to the full dataset; metrics computed on X_os / y_os are
# in-sample and count duplicated rows more than once.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_os, y_os = oversample.fit_resample(X, y)

In [35]:
# Class balance after oversampling. Vectorized reductions replace the Python built-ins
# set()/sum(), which iterate the Series element by element and were evaluated repeatedly.
# (A commented-out line referring to the undefined `y_train_os` was dropped.)
n_total_os = len(y_os)
n_ones_os = int(y_os.sum())

print('Labels: {}'.format(set(y_os.unique().tolist())))
print('Zero count = {}, One count = {}'.format(n_total_os - n_ones_os, n_ones_os))
print('Ratio of One Count = {}'.format(n_ones_os / n_total_os))

Labels: {0, 1}
Zero count = 8992195, One count = 8992195
Ratio of One Count = 0.5


In [36]:
# Refit the same `cb` object on the oversampled data, logging progress every 10 iterations.
# The baseline fit is overwritten, so any earlier cell re-run after this point evaluates this model instead.
cb.fit(X_os, y_os, cat_features = cat_features, verbose=10)

Learning rate set to 0.5
0:	learn: 0.3946337	total: 3.44s	remaining: 5m 40s
10:	learn: 0.1370267	total: 21.7s	remaining: 2m 55s
20:	learn: 0.0936371	total: 39.8s	remaining: 2m 29s
30:	learn: 0.0813565	total: 56.1s	remaining: 2m 4s
40:	learn: 0.0764525	total: 1m 11s	remaining: 1m 42s
50:	learn: 0.0737169	total: 1m 26s	remaining: 1m 23s
60:	learn: 0.0718632	total: 1m 42s	remaining: 1m 5s
70:	learn: 0.0702529	total: 1m 55s	remaining: 47.4s
80:	learn: 0.0693552	total: 2m 11s	remaining: 30.8s
90:	learn: 0.0686134	total: 2m 27s	remaining: 14.6s
99:	learn: 0.0682024	total: 2m 38s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8188035c70>

In [37]:
cb.score(X_os, y_os)

0.9284829788499915

In [38]:
# Confusion matrix of the refit model on the oversampled data it was trained on (in-sample).
confusion_matrix_f = confusion_matrix(y_os, cb.predict(X_os))
confusion_matrix_f

array([[8854801,  137394],
       [1148796, 7843399]])

In [39]:
# Wrap the raw counts in a labelled frame: one row per actual class, one column per predicted class.
confusion_matrix_full = pd.DataFrame(
    data=confusion_matrix_f,
    columns=["Predicted_Rejected", "Predicted_Accepted"],
    index=["Actual_Rejected", "Actual_Accepted"],
)

confusion_matrix_full

Unnamed: 0,Predicted_Rejected,Predicted_Accepted
Actual_Rejected,8854801,137394
Actual_Accepted,1148796,7843399


In [40]:
#Cross Check:
confusion_matrix_full.iloc[0,:].sum()

8992195

In [41]:
#Cross Check:
confusion_matrix_full.iloc[1,:].sum()

8992195

In [42]:
#Accuracy Cross Check: correct predictions (matrix diagonal) divided by all predictions.
# Reduce the underlying array directly: np.sum(DataFrame) dispatches to DataFrame.sum(axis=None),
# whose column-wise behaviour is deprecated in recent pandas releases.
_cm_counts = confusion_matrix_full.to_numpy()
np.trace(_cm_counts) / _cm_counts.sum()

0.9284829788499915

In [43]:
#Sensitivity / Recall == (TP / (TP+FN))
# Unpack the 2x2 matrix row by row: first row is (TN, FP), second row is (FN, TP).
(TN, FP), (FN, TP) = confusion_matrix_f
print(TN, FP, FN, TP, sep='\n')

8854801
137394
1148796
7843399


In [44]:
# Recall (sensitivity) for the positive class, "Accepted":
#   TP = accepted loans that the model predicted as accepted
#   FN = accepted loans that the model predicted as rejected

Rec = TP / (FN + TP)
Rec

0.8722452082055605

In [49]:
#Cross Check "Recall" (or Sensitivity) with Catboost built-in eval metrics:
#from catboost.utils import eval_metric
# (eval_metric is used below without re-importing it here; the import line above is commented out.)

Recall = eval_metric(y_os, cb.predict(X_os), 'Recall')
print(Recall)

[0.8722452082055605]


In [46]:
# Specificity (true-negative rate): share of actually rejected loans predicted as rejected.
Spec = TN / (FP+TN)
Spec

0.9847207494944227

In [47]:
# Precision: share of loans predicted as accepted that were actually accepted.
Prec = TP / (TP + FP)
Prec

0.9827844175384577

In [50]:
# Cross-check the hand-computed precision against CatBoost's eval_metric (returns a one-element list).
Precision = eval_metric(y_os, cb.predict(X_os), 'Precision')
Precision

[0.9827844175384577]

In [51]:
# F1 score: harmonic mean of the precision and recall computed above.
F = 2 * Prec * Rec / (Prec + Rec)
F

0.9242213568995631

In [53]:
# Cross-check the hand-computed F against CatBoost's built-in F1 on the oversampled data.
F1 = eval_metric(y_os, cb.predict(X_os), 'F1')
print(F1)

[0.9242213568995631]


In [None]:
from sklearn.metrics import confusion_matrix

print("TRAIN PERFORMANCE:")
confusion_matrix_train = confusion_matrix(y_train, cb.predict(X_train))
confusion_matrix_train = pd.DataFrame(confusion_matrix_train,
                                      index=["Actual_No","Actual_Yes"],
                                      columns=["Predicted_No","Predicted_Yes"])

display(confusion_matrix_train)

#recall_resignation_train = confusion_matrix_train.iloc[1,1] / confusion_matrix_train.iloc[1,:].sum()

print("Train Score Accuracy: {}".format(round(cb.score(X_train,y_train),3)))
#print("Recall Train: {}".format(round(recall_resignation_train,3)))

print("\n* * * * * * * * * * * * * * * * * * *\n")
print(' ')
print("TEST PERFORMANCE:")
confusion_matrix_test = confusion_matrix(y_test, cb.predict(X_test))
confusion_matrix_test = pd.DataFrame(confusion_matrix_test,
                                     index=["Actual_No","Actual_Yes"],
                                     columns=["Predicted_No","Predicted_Yes"])

display(confusion_matrix_test)

#recall_resignation_test = confusion_matrix_test.iloc[1,1] / confusion_matrix_test.iloc[1,:].sum()

print("Test Score Accuracy: {}".format(round(cb.score(X_test,y_test),3)))
#print("Recall Test: {}".format(round(recall_resignation_test,3)))

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, stratify=y)
print('Labels: {}'.format(set(y_train)))
print('Zero count (Train) = {}, One count (Train) = {}'.format(len(y_train) - sum(y_train), sum(y_train)))
print('Ratio of One Count = {}'.format(sum(y_train)/len(y_train)))
print('Ratio of Train to Test = {}'.format(len(y_train)/(len(y_train)+len(y_test))))
#print('Ratio of Train to Test = {}'.format(len(X_train)/(len(X_train)+len(X_test))))

In [None]:
#X_train.head(2)

In [None]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier(iterations=100)
cb.fit(X_train, y_train, cat_features = cat_features, verbose = False)

In [None]:
from sklearn.metrics import confusion_matrix

print("TRAIN PERFORMANCE:")
confusion_matrix_train = confusion_matrix(y_train, cb.predict(X_train))
confusion_matrix_train = pd.DataFrame(confusion_matrix_train,
                                      index=["Actual_No","Actual_Yes"],
                                      columns=["Predicted_No","Predicted_Yes"])

display(confusion_matrix_train)

#recall_resignation_train = confusion_matrix_train.iloc[1,1] / confusion_matrix_train.iloc[1,:].sum()

print("Train Score Accuracy: {}".format(round(cb.score(X_train,y_train),3)))
#print("Recall Train: {}".format(round(recall_resignation_train,3)))

print("\n* * * * * * * * * * * * * * * * * * *\n")
print(' ')
print("TEST PERFORMANCE:")
confusion_matrix_test = confusion_matrix(y_test, cb.predict(X_test))
confusion_matrix_test = pd.DataFrame(confusion_matrix_test,
                                     index=["Actual_No","Actual_Yes"],
                                     columns=["Predicted_No","Predicted_Yes"])

display(confusion_matrix_test)

#recall_resignation_test = confusion_matrix_test.iloc[1,1] / confusion_matrix_test.iloc[1,:].sum()

print("Test Score Accuracy: {}".format(round(cb.score(X_test,y_test),3)))
#print("Recall Test: {}".format(round(recall_resignation_test,3)))

In [None]:
#Train Score Accuracy: correct predictions (matrix diagonal) divided by all predictions.
# Computed from the live matrix rather than from numbers hand-copied out of an earlier run,
# which no longer match once the split or the model changes.
_train_counts = confusion_matrix_train.to_numpy()
np.trace(_train_counts) / _train_counts.sum()

In [None]:
#Test Score Accuracy: correct predictions (matrix diagonal) divided by all predictions.
# Computed from the live matrix rather than from numbers hand-copied out of an earlier run,
# which no longer match once the split or the model changes.
_test_counts = confusion_matrix_test.to_numpy()
np.trace(_test_counts) / _test_counts.sum()

In [None]:
confusion_matrix(y_train, cb.predict(X_train)).ravel()

In [None]:
confusion_matrix(y_train, cb.predict(X_train))

In [None]:
#Matches Train Actual 'No' (0 Count, Rejected Loans)
# Row 0 of the matrix holds every actual-0 training row, so its sum must equal the label count.
# Derived from live objects instead of hard-coded numbers, and checked explicitly.
train_actual_no = int(confusion_matrix_train.iloc[0, :].sum())
assert train_actual_no == int((y_train == 0).sum())
train_actual_no

In [None]:
#Matches Train Actual 'Yes' (1 Count, Accepted Loans)
# Row 1 of the matrix holds every actual-1 training row, so its sum must equal the label count.
# Derived from live objects instead of hard-coded numbers, and checked explicitly.
train_actual_yes = int(confusion_matrix_train.iloc[1, :].sum())
assert train_actual_yes == int((y_train == 1).sum())
train_actual_yes

In [None]:
from catboost import Pool
#pool = Pool(data=X, label=y, cat_features = cat_features)

In [None]:
# Bundle features, labels and the categorical column names into CatBoost Pool objects,
# one for the training portion and one for the held-out portion.
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

In [None]:
# Refit `cb` on the training pool with the test pool as evaluation set, logging every 10 iterations.
# NOTE(review): with an eval_set, CatBoost's use_best_model may choose the final iteration count from
# the test data, so later test-set scores would not be a clean hold-out estimate — confirm.
cb.fit(train_pool, eval_set=test_pool, verbose=10)
#Log Loss is the "default optimizer for binary classification in CatBoost"

In [None]:
# Sanity check: confirm `cb` holds a trained model and show the parameters it was constructed with.
print('Model is fitted: {}'.format(cb.is_fitted()))
print('Model params:\n{}'.format(cb.get_params()))

In [None]:
# Second model with the same iteration count that additionally tracks AUC and Accuracy during training.
cb2 = CatBoostClassifier(iterations=100, custom_loss=['AUC', 'Accuracy'])

# plot=True requests CatBoost's training chart; the trailing ';' suppresses the cell's output repr.
cb2.fit(train_pool, eval_set = test_pool, verbose=False, plot=True);

In [None]:
import shap

In [None]:
# Horizontal bar chart of CatBoost's built-in feature importances for `cb`, sorted ascending
# so the most important feature ends up at the top of the chart.
feature_importance = cb.feature_importances_
sorted_idx = feature_importance.argsort()
bar_positions = range(len(sorted_idx))
feature_names = np.array(X_test.columns)

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, feature_names[sorted_idx])
plt.title('Feature Importance');

In [None]:
feature_importance

In [None]:
# Horizontal bar chart of CatBoost's built-in feature importances for `cb2`, sorted ascending
# so the most important feature ends up at the top of the chart.
feature_importance = cb2.feature_importances_
sorted_idx = feature_importance.argsort()
bar_positions = range(len(sorted_idx))
feature_names = np.array(X_test.columns)

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
plt.yticks(bar_positions, feature_names[sorted_idx])
plt.title('Feature Importance');

In [None]:
# SHAP-based importance for `cb`: mean absolute SHAP value per feature over the test rows,
# drawn as a horizontal bar chart sorted ascending.
explainer = shap.Explainer(cb)
shap_values = explainer(X_test)
shap_importance = shap_values.abs.mean(0).values
sorted_idx = np.argsort(shap_importance)
bar_positions = range(len(sorted_idx))
feature_names = np.array(X_test.columns)

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, shap_importance[sorted_idx], align='center')
plt.yticks(bar_positions, feature_names[sorted_idx])
plt.title('SHAP Importance')

In [None]:
# SHAP-based importance for `cb2`: mean absolute SHAP value per feature over the test rows,
# drawn as a horizontal bar chart sorted ascending.
explainer = shap.Explainer(cb2)
shap_values = explainer(X_test)
shap_importance = shap_values.abs.mean(0).values
sorted_idx = np.argsort(shap_importance)
bar_positions = range(len(sorted_idx))
feature_names = np.array(X_test.columns)

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, shap_importance[sorted_idx], align='center')
plt.yticks(bar_positions, feature_names[sorted_idx])
plt.title('SHAP Importance')

In [None]:
# max_display limits how many features are drawn, so cap it at the number of columns
# (shape[1]); the original passed the number of rows (shape[0]).
shap.plots.bar(shap_values, max_display=X_test.shape[1])

In [None]:
# max_display limits how many features are drawn, so cap it at the number of columns
# (shape[1]); the original passed the number of rows (shape[0]).
shap.summary_plot(shap_values, max_display=X_test.shape[1])