In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

In [34]:
df = pd.read_csv('preprocessed.csv')

In [35]:

df = df.drop(['ORGANIZATION_TYPE'], axis=1)

a = ['AMT_APPLICATION_APPROVED', 'AMT_APPLICATION_REFUSED', 'AMT_APPLICATION_CANCELED', 
     'AMT_APPLICATION_UNUSED', 'CODE_GENDER', 'CREDIT_ACTIVE_CLOSED', 'CREDIT_ACTIVE_ACTIVE', 
     'CREDIT_ACTIVE_SOLD', 'CREDIT_ACTIVE_BAD_DEBT', 'CREDIT_DAY_OVERDUE']

for col in a:
    df = df[df[col].notna()]
    
df['AVG_AMT_CREDIT_SUM'].fillna(df['AVG_AMT_CREDIT_SUM'].mean(), inplace=True)
df['AGE_BUCKET'].fillna(0, inplace=True)


In [36]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 210843 entries, 0 to 258060
Data columns (total 83 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   210843 non-null  int64  
 1   TARGET                       210843 non-null  int64  
 2   NAME_CONTRACT_TYPE           210843 non-null  object 
 3   CODE_GENDER                  210843 non-null  object 
 4   FLAG_OWN_CAR                 210843 non-null  object 
 5   FLAG_OWN_REALTY              210843 non-null  object 
 6   CNT_CHILDREN                 210843 non-null  int64  
 7   AMT_INCOME_TOTAL             210843 non-null  float64
 8   AMT_CREDIT                   210843 non-null  float64
 9   AMT_ANNUITY                  210843 non-null  float64
 10  AMT_GOODS_PRICE              210843 non-null  float64
 11  NAME_TYPE_SUITE              210843 non-null  object 
 12  NAME_INCOME_TYPE             210843 non-null  object 
 13 

In [37]:
from catboost import CatBoostClassifier, Pool

In [38]:
x_train, x_test, y_train, y_test = tts(df.drop('TARGET', axis=1), df['TARGET'], test_size = 0.2, random_state=0)

In [39]:
x_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168674 entries, 258016 to 244009
Data columns (total 82 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   168674 non-null  int64  
 1   NAME_CONTRACT_TYPE           168674 non-null  object 
 2   CODE_GENDER                  168674 non-null  object 
 3   FLAG_OWN_CAR                 168674 non-null  object 
 4   FLAG_OWN_REALTY              168674 non-null  object 
 5   CNT_CHILDREN                 168674 non-null  int64  
 6   AMT_INCOME_TOTAL             168674 non-null  float64
 7   AMT_CREDIT                   168674 non-null  float64
 8   AMT_ANNUITY                  168674 non-null  float64
 9   AMT_GOODS_PRICE              168674 non-null  float64
 10  NAME_TYPE_SUITE              168674 non-null  object 
 11  NAME_INCOME_TYPE             168674 non-null  object 
 12  NAME_EDUCATION_TYPE          168674 non-null  object 

In [40]:
cat_fet = [1,2,3,4,10,11,12,13,14,26,30,81]

In [41]:
model = CatBoostClassifier(custom_loss=['AUC', 'F1', 'Precision', 'Recall'])

In [42]:
model.fit(x_train,y_train,cat_fet)

Learning rate set to 0.09202
0:	learn: 0.6034611	total: 254ms	remaining: 4m 13s
1:	learn: 0.5296349	total: 353ms	remaining: 2m 56s
2:	learn: 0.4718971	total: 457ms	remaining: 2m 31s
3:	learn: 0.4275622	total: 554ms	remaining: 2m 17s
4:	learn: 0.3941380	total: 651ms	remaining: 2m 9s
5:	learn: 0.3671270	total: 738ms	remaining: 2m 2s
6:	learn: 0.3465749	total: 824ms	remaining: 1m 56s
7:	learn: 0.3304080	total: 911ms	remaining: 1m 53s
8:	learn: 0.3167322	total: 1.01s	remaining: 1m 51s
9:	learn: 0.3062843	total: 1.11s	remaining: 1m 50s
10:	learn: 0.2970931	total: 1.2s	remaining: 1m 47s
11:	learn: 0.2909928	total: 1.3s	remaining: 1m 46s
12:	learn: 0.2846568	total: 1.38s	remaining: 1m 44s
13:	learn: 0.2799973	total: 1.48s	remaining: 1m 44s
14:	learn: 0.2759136	total: 1.57s	remaining: 1m 43s
15:	learn: 0.2728325	total: 1.66s	remaining: 1m 41s
16:	learn: 0.2697695	total: 1.74s	remaining: 1m 40s
17:	learn: 0.2672607	total: 1.83s	remaining: 1m 40s
18:	learn: 0.2653168	total: 1.93s	remaining: 1m 3

<catboost.core.CatBoostClassifier at 0x22006d5eeb0>

In [43]:
y_pred = model.predict(x_test)
print('Results of catboost on raw data')
print('Accuracy - ',accuracy_score(y_test,y_pred))
print('Balanced Accuracy - ',balanced_accuracy_score(y_test,y_pred))
print('ROC-AUC - ',roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]))

Results of catboost on raw data
Accuracy -  0.9202731864639901
Balanced Accuracy -  0.5085577137348056
ROC-AUC -  0.7373347323666608


In [45]:
zero = df[df['TARGET'] == 0].head(22393)
one = df[df['TARGET'] == 1]
half = pd.concat([zero, one])

x_bal = half.drop(['TARGET'],axis=1)
y_bal = half['TARGET']
x_train, x_test, y_train, y_test = tts(x_bal, y_bal, test_size = 0.2, random_state=0)
model = CatBoostClassifier(custom_loss=['AUC', 'F1', 'Precision', 'Recall'])
model.fit(x_train,y_train,cat_fet)

Learning rate set to 0.044818
0:	learn: 0.6169707	total: 34.7ms	remaining: 34.6s
1:	learn: 0.5379007	total: 70.9ms	remaining: 35.4s
2:	learn: 0.4867585	total: 116ms	remaining: 38.7s
3:	learn: 0.4337613	total: 155ms	remaining: 38.5s
4:	learn: 0.3943494	total: 187ms	remaining: 37.3s
5:	learn: 0.3662979	total: 222ms	remaining: 36.7s
6:	learn: 0.3408775	total: 255ms	remaining: 36.2s
7:	learn: 0.3201180	total: 293ms	remaining: 36.4s
8:	learn: 0.3020075	total: 325ms	remaining: 35.8s
9:	learn: 0.2841210	total: 364ms	remaining: 36s
10:	learn: 0.2687271	total: 402ms	remaining: 36.2s
11:	learn: 0.2561953	total: 437ms	remaining: 36s
12:	learn: 0.2458173	total: 470ms	remaining: 35.7s
13:	learn: 0.2378595	total: 504ms	remaining: 35.5s
14:	learn: 0.2294152	total: 541ms	remaining: 35.5s
15:	learn: 0.2220776	total: 576ms	remaining: 35.4s
16:	learn: 0.2171105	total: 611ms	remaining: 35.3s
17:	learn: 0.2123471	total: 648ms	remaining: 35.4s
18:	learn: 0.2084167	total: 686ms	remaining: 35.4s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x220003c2f70>

In [46]:
y_pred = model.predict(x_test)
print('Results of catboost on balanced data')
print('Accuracy - ',accuracy_score(y_test,y_pred))
print('Balanced Accuracy - ',balanced_accuracy_score(y_test,y_pred))
print('ROC-AUC - ',roc_auc_score(y_test, model.predict_proba(x_test)[:, 1]))

Results of catboost on balanced data
Accuracy -  0.947711582715418
Balanced Accuracy -  0.9394890788281144
ROC-AUC -  0.9621702040929335
