In [139]:
# import itertools
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.notebook import trange, tqdm
np.random.seed(22)

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import preprocessing
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import jaccard_similarity_score, f1_score, log_loss, accuracy_score
from collections import Counter
import warnings

In [140]:
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [141]:
def plot_loss_accuracy(history):
    historydf = pd.DataFrame(history.history, index=history.epoch)
    plt.figure(figsize=(8, 6))
    historydf.plot(ylim=(0, max(1, historydf.values.max())))
    loss = history.history['loss'][-1]
    acc = history.history['acc'][-1]
    plt.title('Loss: %.3f, Accuracy: %.3f' % (loss, acc))

In [142]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.columns = train_data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
test_data.columns = test_data.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
train_data.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [143]:
train_data.shape

(614, 13)

In [144]:
train_data.isnull().sum()

loan_id               0
gender               13
married               3
dependents           15
education             0
self_employed        32
applicantincome       0
coapplicantincome     0
loanamount           22
loan_amount_term     14
credit_history       50
property_area         0
loan_status           0
dtype: int64

In [145]:
test_data.isnull().sum()

loan_id               0
gender               11
married               0
dependents           10
education             0
self_employed        23
applicantincome       0
coapplicantincome     0
loanamount            5
loan_amount_term      6
credit_history       29
property_area         0
dtype: int64

In [146]:
train_data.dtypes

loan_id               object
gender                object
married               object
dependents            object
education             object
self_employed         object
applicantincome        int64
coapplicantincome    float64
loanamount           float64
loan_amount_term     float64
credit_history       float64
property_area         object
loan_status           object
dtype: object

In [147]:
train_data.loc[(train_data['credit_history'].isnull()) & (train_data['loan_status'] == 'Y'), 'credit_history'] = 1.0
train_data.loc[(train_data['credit_history'].isnull()) & (train_data['loan_status'] == 'N'), 'credit_history'] = 0

train_data['credit_history'] = train_data['credit_history'].replace({1.0: 'Y', 0: 'N'})
test_data['credit_history'] = test_data['credit_history'].replace({1.0: 'Y', 0: 'N'})

In [148]:
# loan_avg = train_data.pivot_table(index = ['education', 'self_employed'], values = 'loanamount', aggfunc = np.mean)

train_data["gender"].fillna('NaN', inplace = True)
test_data["gender"].fillna('NaN', inplace = True)
train_data["self_employed"].fillna('NaN', inplace = True)
test_data["self_employed"].fillna('NaN', inplace = True)
train_data["married"].fillna('NaN', inplace = True)
train_data["dependents"].fillna('NaN', inplace = True)
test_data["dependents"].fillna('NaN', inplace = True)

## Let Catboost handle nulls in credit history
test_data["credit_history"].fillna('NaN', inplace = True)

# Handle outliers in Loan Amount
medianLoanAmt = train_data.loc[train_data['loanamount'] < 300, 'loanamount'].median()
train_data["loanamount"] = np.where(train_data["loanamount"] > 300, medianLoanAmt, train_data['loanamount'])
test_data["loanamount"] = np.where(test_data["loanamount"] > 300, medianLoanAmt, test_data['loanamount'])

#train_data = train_data.fillna(train_data.mean())
#train_data["loanamount"] = train_data.groupby("married")["loanamount"].transform(lambda x: x.fillna(x.mean()))
#test_data["loanamount"] = test_data.groupby("married")["loanamount"].transform(lambda x: x.fillna(x.mean()))
train_data["loanamount"] = train_data["loanamount"].transform(lambda x: x.fillna(x.median()))
test_data["loanamount"] = test_data["loanamount"].transform(lambda x: x.fillna(x.median()))

train_data["total_income"] = train_data["applicantincome"] + train_data["coapplicantincome"]
test_data["total_income"] = test_data["applicantincome"] + test_data["coapplicantincome"]

train_data["loan_amount_term"] = train_data["loan_amount_term"].transform(lambda x: x.fillna(x.median()))
test_data["loan_amount_term"] = test_data["loan_amount_term"].transform(lambda x: x.fillna(x.median()))

#test_data['loanamount'].fillna((test_data['loanamount'].mean()), inplace = True)
#test_data['loan_amount_term'].fillna((test_data['loan_amount_term'].mean()), inplace = True)
# test_data["credit_history"].fillna(1, inplace = True)

train_data = train_data.drop(["loan_id"], axis = 1)
loan_ids = test_data["loan_id"].values
test_data = test_data.drop(["loan_id"], axis = 1)

In [149]:
ss = StandardScaler()
scale_features = ['applicantincome', 'coapplicantincome', 'loan_amount_term', 'loanamount', 'total_income']
train_data[scale_features] = ss.fit_transform(train_data[scale_features])

In [150]:
# Defining X, y and splittinf the training data

X = train_data.drop('loan_status', axis = 1).values
y = train_data['loan_status'].values

# Normalize data
# Scaling the features within range (0, 1)

#X = preprocessing.StandardScaler().fit(X).transform(X)

#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 4, test_size = 0.33, shuffle = True)

In [151]:
kfold, scores = KFold(n_splits = 5, shuffle = True, random_state = 22), list()
for train, test in kfold.split(X):
    X_train, x_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    model = CatBoostClassifier(random_state = 22, max_depth = 6, n_estimators = 100, verbose = 500)
    model.fit(X_train, y_train, cat_features=[0, 1, 2, 3, 4, 9, 10])
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    print('Validation Accuracy:', score)
print("Average Validation Accuracy: ", sum(scores)/len(scores))

Learning rate set to 0.06281
0:	learn: 0.6702656	total: 8.01ms	remaining: 793ms
99:	learn: 0.3894223	total: 338ms	remaining: 0us
Validation Accuracy: 0.8536585365853658
Learning rate set to 0.06281
0:	learn: 0.6680986	total: 5.36ms	remaining: 531ms
99:	learn: 0.3753722	total: 345ms	remaining: 0us
Validation Accuracy: 0.8048780487804879
Learning rate set to 0.06281
0:	learn: 0.6692086	total: 7.39ms	remaining: 732ms
99:	learn: 0.3703042	total: 719ms	remaining: 0us
Validation Accuracy: 0.7967479674796748
Learning rate set to 0.06281
0:	learn: 0.6713113	total: 7.24ms	remaining: 717ms
99:	learn: 0.3885216	total: 329ms	remaining: 0us
Validation Accuracy: 0.8373983739837398
Learning rate set to 0.062864
0:	learn: 0.6729930	total: 11ms	remaining: 1.09s
99:	learn: 0.3950406	total: 341ms	remaining: 0us
Validation Accuracy: 0.860655737704918
Average Validation Accuracy:  0.8306677329068373


In [152]:
test_preds = model.predict(test_data)

In [153]:
test_preds[0:5]

array(['Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [154]:
submission_df = pd.DataFrame({
                  "Loan_ID": loan_ids, 
                  "Loan_Status": test_preds.flatten()})

In [155]:
submission_df.to_csv('submission_CB.csv', index = False)

In [157]:
# Label Encoding

le = LabelEncoder()
train_data["education"] = le.fit_transform(train_data["education"])
test_data["education"] = le.transform(test_data["education"])
train_data["gender"] = le.fit_transform(train_data["gender"])
test_data["gender"] = le.transform(test_data["gender"])
train_data["married"] = le.fit_transform(train_data["married"])
test_data["married"] = le.transform(test_data["married"])
train_data["dependents"] = le.fit_transform(train_data["dependents"])
test_data["dependents"] = le.transform(test_data["dependents"])
train_data["property_area"] = le.fit_transform(train_data["property_area"])
test_data["property_area"] = le.transform(test_data["property_area"])
train_data["credit_history"] = le.fit_transform(train_data["credit_history"])
test_data["credit_history"] = le.transform(test_data["credit_history"])
train_data["self_employed"] = le.fit_transform(train_data["self_employed"])
test_data["self_employed"] = le.transform(test_data["self_employed"])
train_data.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status,total_income
0,1,1,0,0,1,0.072991,-0.554487,-0.134389,0.273231,1,2,Y,-0.182184
1,1,2,1,0,1,-0.134412,-0.038732,-0.063803,0.273231,1,0,N,-0.144684
2,1,2,0,0,2,-0.393747,-0.554487,-1.314177,0.273231,1,2,Y,-0.623656
3,1,2,0,1,1,-0.462062,0.25198,-0.225142,0.273231,1,2,Y,-0.322885
4,1,1,0,0,1,0.097728,-0.554487,0.198372,0.273231,1,2,Y,-0.158785


In [158]:
X_train, Y = train_data.drop(["loan_status"], axis = 1).values, train_data["loan_status"].values
X_test = test_data.values

X_train.shape, Y.shape, X_test.shape

((614, 12), (614,), (367, 12))

In [190]:
num_class2

155

In [192]:
kfold, scores = KFold(n_splits = 5, shuffle = True, random_state = 22), list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y[train], Y[test]

    num_class1, num_class2 = Counter(y_train)['Y'], Counter(y_train)['N']
    sm = SMOTE(random_state = 22, sampling_strategy = {'Y': int(1.0*num_class1), 'N': int(2.2*num_class2)})
    x_train, y_train = sm.fit_resample(x_train, y_train)
    
    LGBM_model = LGBMClassifier(random_state = 22, max_depth = 6, n_estimators = 400)
    LGBM_model.fit(x_train, y_train, categorical_feature = [0, 1, 2, 3, 4, 9, 10])
    preds = LGBM_model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    print('Validation Accuracy:', score)
print("Average Validation Accuracy: ", sum(scores)/len(scores))

New categorical_feature is [0, 1, 2, 3, 4, 9, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Validation Accuracy: 0.7967479674796748


  n_samples_majority))
New categorical_feature is [0, 1, 2, 3, 4, 9, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Validation Accuracy: 0.6829268292682927


New categorical_feature is [0, 1, 2, 3, 4, 9, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Validation Accuracy: 0.7642276422764228


New categorical_feature is [0, 1, 2, 3, 4, 9, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Validation Accuracy: 0.7804878048780488


  n_samples_majority))
New categorical_feature is [0, 1, 2, 3, 4, 9, 10]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Validation Accuracy: 0.7622950819672131
Average Validation Accuracy:  0.7573370651739304
