# CatBoost

In [44]:
import numpy as np
import pandas as pd
import seaborn as sns

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import (f1_score, roc_auc_score,confusion_matrix, accuracy_score,
                             precision_score, recall_score, matthews_corrcoef)
from IPython.display import HTML, display
import tabulate

## Read Data File

In [2]:
# Machine-specific location of loan_data.csv; point this at your own copy before running.
loan_csv_path = '/media/abhishek/589E61B39E618A783/C-DAC Document/Practical Machine Learning/PML/Code/Dataset/loan_data.csv'
df = pd.read_csv(loan_csv_path)

In [3]:
df.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849.0,0.0,0.0,360.0,Yes,1
1,4583.0,1508.0,128.0,360.0,Yes,0
2,3000.0,0.0,66.0,360.0,Yes,1
3,2583.0,2358.0,120.0,360.0,Yes,1
4,6000.0,0.0,141.0,360.0,Yes,1
5,5417.0,4196.0,267.0,360.0,Yes,1
6,2333.0,1516.0,95.0,360.0,Yes,1
7,3036.0,2504.0,158.0,360.0,No,0
8,4006.0,1526.0,168.0,360.0,Yes,1
9,12841.0,10968.0,349.0,360.0,Yes,0


## EDA

In [4]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"Total number of rows in datasets = {n_rows} ")
print(f"Total number of columns in datasets = {n_cols} ")

Total number of rows in datasets = 614 
Total number of columns in datasets = 6 


In [5]:
df.isnull().sum()

ApplicantIncome      2
CoapplicantIncome    2
LoanAmount           3
Loan_Amount_Term     2
Credit_History       0
Loan_Status          0
dtype: int64

In [6]:
 df.nunique()

ApplicantIncome      503
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      11
Credit_History         2
Loan_Status            2
dtype: int64

In [7]:
 df.Loan_Amount_Term.value_counts(dropna = False)

360.0    511
180.0     43
480.0     15
0.0       14
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
NaN        2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [8]:
df.Loan_Status.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [9]:
df.Credit_History.value_counts()

Yes    475
No     139
Name: Credit_History, dtype: int64

## Feature Engineering/Selection

### Drop 2 rows with missing values of Loan_Amount_Term

In [10]:
# Remove the rows whose Loan_Amount_Term is NaN. inplace=True mutates `df`
# itself, so every later cell works on the filtered frame.
df.dropna(subset = ['Loan_Amount_Term'], inplace = True)

In [11]:
# Store the loan term as integers; the column is later passed to CatBoost as a categorical feature.
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].astype(int)

In [12]:
df.dtypes

ApplicantIncome      float64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term       int64
Credit_History        object
Loan_Status            int64
dtype: object

In [13]:
# Separate the label column from the predictor columns.
target_col = "Loan_Status"
y = df[target_col]
x = df.drop(columns = [target_col])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size = 0.8,
    random_state = 42,
)

In [15]:
x_test.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
81,2395.0,0.0,0.0,360,Yes
218,5000.0,0.0,72.0,360,No
55,2708.0,1167.0,97.0,360,Yes
602,5703.0,0.0,128.0,360,Yes
265,4095.0,3447.0,151.0,360,Yes


In [16]:
x_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
563,5800.0,0.0,132.0,360,Yes
289,9508.0,0.0,187.0,360,Yes
324,15000.0,0.0,300.0,360,Yes
132,2718.0,0.0,70.0,360,Yes
174,4344.0,736.0,87.0,360,Yes


## Train CatBoost Model

In [40]:
# Hyper-parameters gathered in one place: CPU training, 150 boosting
# iterations, a fixed seed, and accuracy as the metric tracked during fitting.
catboost_params = {
    "task_type": 'CPU',
    "iterations": 150,
    "random_state": 2021,
    "eval_metric": "Accuracy",
}
model = CatBoostClassifier(**catboost_params)

In [41]:
# Columns handed to CatBoost as categorical features (passed to model.fit).
cat_features = ["Loan_Amount_Term", "Credit_History"]

In [42]:
# Fit on the training split. The held-out split is supplied as eval_set only
# so that its metric curve is logged and plotted next to the training curve.
# use_best_model=False keeps every boosted tree: when an eval_set is given,
# CatBoost otherwise truncates the model at the iteration that scores best on
# it, which would tune the model on the same data used for the final
# evaluation and make the reported test metrics optimistic.
model.fit(
    x_train,
    y_train,
    cat_features = cat_features,
    eval_set = (x_test, y_test),
    use_best_model = False,
    plot = True,
)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.060613
0:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 619us	remaining: 92.3ms
1:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 1.31ms	remaining: 96.8ms
2:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 2.03ms	remaining: 99.7ms
3:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 2.65ms	remaining: 96.5ms
4:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 3.2ms	remaining: 92.7ms
5:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 3.74ms	remaining: 89.7ms
6:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 4.28ms	remaining: 87.3ms
7:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 4.83ms	remaining: 85.7ms
8:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 5.26ms	remaining: 82.4ms
9:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 5.8ms	remaining: 81.2ms
10:	learn: 0.7689162	test: 0.7723577	best: 0.7723577 (0)	total: 6.05ms	remaining: 76.4ms
11:	

<catboost.core.CatBoostClassifier at 0x7fbc6df494c0>

## Predict & Evaluate 

In [45]:
def predict_and_evaluate(model, X_test, y_test):
    '''Predict labels for a test set and report binary-classification metrics.

    Computes accuracy, precision, recall and F1 score (each rounded to two
    decimals) together with the raw false-positive and false-negative
    counts, renders them as a one-row HTML table in the notebook output,
    and returns that row.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` (and, optionally,
        ``classes_``).
    X_test : feature rows to predict on.
    y_test : true binary labels aligned with ``X_test``.

    Returns
    -------
    list
        A single-row table:
        ``[[algorithm_name, accuracy, false_positives, false_negatives,
        precision, recall, f1]]``.
    '''
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Pin the confusion matrix to the two classes the model was trained on
    # (when the model exposes them) so it is always 2x2. Without explicit
    # labels the matrix collapses to 1x1 whenever y_test and the predictions
    # contain a single class, and the four-way unpack below would fail.
    classes = getattr(model, 'classes_', None)
    labels = classes if classes is not None and len(classes) == 2 else None
    _, fp, fn, _ = confusion_matrix(y_test, predictions, labels=labels).ravel()

    metrics = [round(accuracy,2), fp, fn, round(precision,2), round(recall,2), round(f1,2)]
    table_row = [[model.__class__.__name__] + metrics]
    display(HTML(tabulate.tabulate(table_row,headers=('Algorithm','Accuracy', 'False Positives', 
                                                  'False Negatives', 'Precision', 
                                                  'Recall', 'F1 Score'), 
                                   tablefmt='html')))
    return table_row

In [46]:
# Evaluate the fitted model on the held-out split; `res` keeps the one-row
# metrics table that predict_and_evaluate returns.
res = predict_and_evaluate(model, x_test, y_test)

Algorithm,Accuracy,False Positives,False Negatives,Precision,Recall,F1 Score
CatBoostClassifier,0.77,20,8,0.79,0.9,0.84
