In [7]:

# -*- coding: utf-8 -*-
"""

1. The script uses Python version=3.5.5, Keras=2.2.0, TensorFlow=1.8.0.
2. The input CSV files must be in the parent directory of the folder that contains
    this python script (they are read from "../").
3. The code will create save_folder="Credit_Defaulter",
    containing Confusion Matrix/Output file reporting accuracies/ NN model checkpoints

"""
# will ignore unnecessary scikit-learn warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import itertools
#from collections import Counter
#import missingno as msno
from datetime import datetime

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MaxAbsScaler 
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

#from keras import regularizers
from keras.models import Sequential, load_model
from keras.optimizers import Adam #SGD,,RMSprop
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint

import xgboost
import json



# Seed NumPy's global random generator and TensorFlow's random seed.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
# NOTE: this rebinds `seed` from the numpy function imported above to the integer 1.
# From here on `seed` is the value passed as random_state / seed to the functions below.
seed=1

# Name of the target column, and the folder every output file is written to.
targetName, Tosave_Folder="loan_status", "Credit_Defaulter"
# Accuracy of every evaluated fold; Performace_Metrics appends to this list.
accuracies=[]



# Create the output folder if it does not exist yet.
if not os.path.exists(Tosave_Folder):
    os.makedirs(Tosave_Folder)


def is_File(fileNameInput):
    """
    Verify that the given path points at an existing regular file.
    Input - path of the file to check
    Output - True when the file exists; raises ValueError when it does not
    """
    if os.path.isfile(fileNameInput):
        return True
    raise ValueError("You must provide a valid fileName as parameter")
    
    
    
    

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints the confusion matrix and draws it as an annotated
    heat map on the current matplotlib figure.

    Input - cm: 2-D confusion matrix (rows are drawn as 'True label',
                columns as 'Predicted label')
            classes: tick labels, one per class
            normalize: when True every row is divided by its row total, so each
                cell shows the fraction of that true class (two decimals);
                when False the values are shown as given
            title: title of the plot
            cmap: matplotlib colormap of the heat map
    Output - None. The caller creates, saves and closes the figure.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True).astype(float)
        # A class with no samples would divide by zero; leave such rows at 0.
        row_totals[row_totals == 0] = 1.0
        cm = cm / row_totals

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The 'd' format only accepts integers, so fall back to two decimals for
    # any non-integer matrix even when normalize is False.
    show_decimals = normalize or not np.issubdtype(cm.dtype, np.integer)
    fmt = '.2f' if show_decimals else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black", fontsize='large')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    




def plotting(dataset):
    """
    Placeholder for the feature-inspection plots; currently does nothing.
    The plotting code is disabled because its output is already shown in the
    ppt file and is not needed now.
    """
    # Disabled code, kept for reference:
    #
    # # --Maximum amount of remaining outstanding principal for total amount funded
    # # --interest rate for most Defaulters
    #
    # fig, ax = plt.subplots(figsize=(5,5))
    # ax.set_xlabel('interest_rate')
    # ax.set_ylabel('out_prncp')
    # dataset.groupby(['interest_rate','loan_status']).count()['out_prncp'].unstack().plot(kind='bar',ax=ax,)
    # plt.show()
    # # --For term, max acc_now_delinq having loan status default.
    # fig, ax = plt.subplots(figsize=(5,5))
    # ax.set_xlabel('term')
    # ax.set_ylabel('acc_now_delinq')
    # dataset.groupby(['term','loan_status']).count()['acc_now_delinq'].unstack().plot(kind='bar',ax=ax)
    # plt.show()






def Read_Data():
    """
    Load the three input CSV files from the parent directory and keep only
    the columns used later on.
    output - one dataframe holding the selected columns of all three files
             side by side
    """
    # One entry per input file: (file name, columns to keep from it)
    wanted = [
        ("Loan Classification Information.csv",
         ["funded_amnt","funded_amnt_inv", "int_rate", "loan_amnt",
          "loan_status","pymnt_plan","term","sub_grade"]),
        ("Borrower Information.csv",
         ["annual_inc","emp_length",
          "open_acc","pub_rec","total_acc"]),
        ("Payment.csv",
         ["last_pymnt_amnt","revol_bal","total_pymnt","total_pymnt_inv",
          "total_rec_late_fee" ]),
    ]

    #---- Reading every file into a data frame
    # NOTE(review): error_bad_lines=False skips malformed lines, and the frames are
    # later joined side by side on their row index - confirm the files stay aligned.
    frames = [
        pd.read_csv("../"+str(file_name), header='infer', sep=',', encoding="utf-8",
                    error_bad_lines=False, doublequote=True, low_memory=False)
        for file_name, _ in wanted
    ]

    # Filter needed columns
    subsets = [frame[columns] for frame, (_, columns) in zip(frames, wanted)]

    # Merging all selected features
    data_subset = pd.concat(subsets, axis=1)
    print("Dataframe with filtered columns - shape is: ",np.shape(data_subset))
    return data_subset



def Under_Sampling(data_subset):
    """
    This function deals with data imbalance by under-sampling the 'Fully Paid' rows.
    input - Takes a dataset with filtered columns as input.
    output - Balanced dataset (every 'Default' row plus an equally sized random
             sample of 'Fully Paid' rows), and whole dataset (all 'Default' rows
             followed by all 'Fully Paid' rows)
    """

    # Work with row POSITIONS, because the rows are picked with .iloc below.
    # (Index labels only equal positions for a default RangeIndex.)
    status = data_subset['loan_status']
    fraud_indices = np.flatnonzero(status.isin(['Default']).values)
    normal_indices = np.flatnonzero(status.isin(['Fully Paid']).values)

    # keep only fully paid and Default data
    full_data_indices = np.concatenate([fraud_indices, normal_indices])
    whole_data = data_subset.iloc[full_data_indices,:]

    # Number of Default rows. len() is required here: counting non-zero index
    # values would miss a Default that sits in row 0.
    count_Default = len(fraud_indices)

    # Seeded shuffle, then take as many 'Fully Paid' rows as there are Defaults.
    shuffled_normal_indices = shuffle(normal_indices, random_state=seed)
    random_normal_indices = np.array(shuffled_normal_indices[:count_Default])

    under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
    under_sample_data = data_subset.iloc[under_sample_indices,:]

    print (under_sample_data.head(5))
    print("Normal indices",len(random_normal_indices))
    print("fraud indices in original data",len(fraud_indices))
    print("Pruned dataframe shape: ",np.shape(under_sample_data))
    print("fraud count in undersample data",np.count_nonzero(under_sample_data['loan_status']=='Default'))
    print("Normal count in undersample data",np.count_nonzero(under_sample_data['loan_status']=='Fully Paid'))

    return under_sample_data, whole_data



    
# One entry per numeric column that is turned into a categorical bucket column:
# (source column, new column, name shown in the progress message,
#  bin edges, bin labels, print a preview of the frame afterwards?)
# Bins are right-closed: label k covers edges[k] < value <= edges[k+1].
_BUCKET_SPECS = (
    ('int_rate', 'interest_rate', 'interest rate',
     [6, 10, 15, 20, 25, 30],
     ["6To10", "10To15", "15To20", "20To25", "25To30"],
     False),
    ('annual_inc', 'annual_income', 'annual_income',
     [0, 30000, 60000, 90000, 100000, 300000, 600000, np.inf],
     ["0To30000", "30000To60000", "60000To90000", "90000To100000",
      "100000To300000", "300000To600000", "GreaterThan600000"],
     True),
    ('open_acc', 'openAccount', 'openAccount',
     [0, 10, 20, 30, 40, 50, 60, 70, np.inf],
     ["0To10", "10To20", "20To30", "30To40", "40To50", "50To60", "60To70",
      "GreaterThan70"],
     True),
    ('pub_rec', 'derog_public_record', 'derog_public_record',
     [0, 5, 10, 15, 20, 25, 30, np.inf],
     # The open-ended bucket starts at 30; it used to carry the label "GreaterThan70".
     ["0To5", "5To10", "10To15", "15To20", "20To25", "25To30", "GreaterThan30"],
     True),
    ('revol_bal', 'revolving_balance', 'revolving_balance',
     [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, np.inf],
     ["0To1000", "1000To2000", "2000To3000", "3000To4000", "4000To5000",
      "5000To6000", "6000To7000", "7000To8000", "8000To9000", "GreaterThan9000"],
     True),
    ('funded_amnt', 'funded_amount', 'funded_amount',
     [0, 5000, 10000, 15000, 20000, 25000, 30000, np.inf],
     ["0To5000", "5000To10000", "10000To15000", "15000To20000", "20000To25000",
      "25000To30000", "GreaterThan30000"],
     False),
)


def _bucketize(values, edges, labels):
    """Label each value with its right-closed bin; values in no bin (incl. NaN) get '0'."""
    binned = pd.cut(values, bins=edges, labels=labels, right=True)
    return binned.astype(object).where(binned.notna(), '0')


def Data_Preprocessing(under_sample_data):
    """
    This function does categorization and transformation on balanced classes dataset.
    Every numeric column listed in _BUCKET_SPECS is replaced by a string bucket
    column (appended at the end of the frame), and emp_length is reduced to the
    bare number of years ('10+ years' -> '10', '1 year' -> '1',
    '< 1 year' and missing values -> '0').
    input - Balanced classes dataset (it is not modified)
    output - Copy of the balanced classes dataset after modifying its attributes
    """
    dataframe = under_sample_data.copy()

    for source_col, new_col, shown_name, edges, labels, preview in _BUCKET_SPECS:
        print("Cateogrizing "+shown_name+" ...")
        dataframe[new_col] = _bucketize(dataframe[source_col], edges, labels)
        dataframe = dataframe.drop([source_col], axis=1)
        if preview:
            print (dataframe.columns.tolist())
            print (dataframe.head(5))

    # Clean emp_length. '< 1 year' must be handled before the generic suffix
    # removal, and the suffix pattern has to cover '+ years', ' years' and ' year'.
    emp_length = dataframe['emp_length'].fillna('0')
    emp_length = emp_length.replace(to_replace=r'^\s*<\s*1\s*years?\s*$', value='0', regex=True)
    emp_length = emp_length.replace(to_replace=r'\s*\+?\s*years?\s*$', value='', regex=True)
    dataframe['emp_length'] = emp_length

    return dataframe


def RoundOff_Data(dataframe):
    """
    This function does rounding off of columns having nearly same decimal values.
    The rounded columns are moved to the end of the frame, in the order listed
    in cols_ToRoundOff.
    input - balanced classes dataset (left untouched, so the calling cell can be re-run)
    output - new balanced classes dataset with some columns rounded off
    """

    print("\n\nRounding Off...\n")
    cols_ToRoundOff=['total_rec_late_fee', 'total_pymnt_inv', 'total_pymnt','funded_amnt_inv','last_pymnt_amnt' ]

    # drop() returns a new frame; the caller's dataframe keeps all its columns.
    dataset = dataframe.drop(cols_ToRoundOff, axis=1)
    for colname in cols_ToRoundOff:
        dataset[colname] = dataframe[colname].round()

    print(np.shape(dataset))

    return dataset






def Encoding(dataset):
    """
    This function does label encoding of every column (values are compared as
    strings) and One-Hot-Encoding of the label encoded features. The classes of
    each column are saved to <Tosave_Folder>/encoding/<column>.npy so that
    Inv_Enc can decode the target later.
    input - dataset with balanced classes and mixed attributes (left untouched,
            so the calling cell can be re-run)
    output - One-Hot-Encoded feature matrix, label encoded target column,
             label encoded feature dataframe
    """

    print("Label Encoding ... ")
    Path(Tosave_Folder+"/"+'encoding').mkdir(parents=True, exist_ok=True)

    # Encode a copy; the caller's dataframe keeps its original values.
    encoded = dataset.copy()

    # Label encoding for each feature
    leX = LabelEncoder()
    for colName in encoded.columns.tolist():
        encoded[colName] = leX.fit_transform((encoded[colName].astype(str)))
        np.save(Tosave_Folder+"/encoding/"+colName+".npy",leX.classes_)

    #Separating target class (same column name that Inv_Enc uses to find its .npy file)
    y_labEnc = encoded[targetName].copy()

    #Separate feature matrix
    X_labEnc = encoded.drop([targetName], axis=1)

    # One-Hot-Encoding
    ohe_X = OneHotEncoder()
    X_OHE = ohe_X.fit_transform(X_labEnc)
    print("\nOne Hot Encoded Data",np.shape(X_OHE))

    #need first 2 for neural networks & X_labEnc for XGBS
    return X_OHE, y_labEnc, X_labEnc





def Rescaling(X_Train, X_Test):
    """
    Scale both feature matrices with one MaxAbsScaler that is fitted on the
    Training set only.
    input - Splitted and one hot encoded Training set and Test set
    output - Rescaled Training set and rescaled Test set
    """
    scaler = MaxAbsScaler()
    scaled_train = scaler.fit_transform(X_Train)
    scaled_test = scaler.transform(X_Test)
    return scaled_train, scaled_test




def Inv_Enc(y_P, y_Te):
    """
    This function returns the actual values of the target classes after the prediction is done
    on the Test set. The classes are read from <Tosave_Folder>/encoding/<targetName>.npy.
    Input - Encoded Predicted values and Encoded True values of the Target Class
    Output- De-Encoded predicted values and De-Encoded True values of the Target Class.
    Raises - ValueError (from is_File) when the saved classes file does not exist.
    """

    fileToRead = Tosave_Folder+"/encoding/"+str(targetName)+".npy"
    # is_File raises when the file is missing, so no else-branch is needed.
    is_File(fileToRead)

    Le_Decode = LabelEncoder()
    # The saved classes are strings, which NumPy may have stored as a pickled
    # object array; recent NumPy versions refuse to load those unless
    # allow_pickle=True is given. Only load files written by this notebook.
    Le_Decode.classes_ = np.load(fileToRead, allow_pickle=True)

    # ravel: predictions may arrive as a column vector or a plain list.
    y_P_Decoded = Le_Decode.inverse_transform(np.ravel(y_P))
    y_T_Decoded = Le_Decode.inverse_transform(np.ravel(y_Te))

    return y_P_Decoded, y_T_Decoded
            
    



def Performace_Metrics(y_Pred, y_Test, y_Train, classifier_fold_info):
    """
    This function computes the classification metrics for one fold, saves the
    confusion matrix plot, appends the fold record to <Tosave_Folder>/Output.json
    (one JSON object per fold, separated by a newline) and appends the accuracy
    to the global list `accuracies`.
    These metrics mainly include Accuracy, Precision, Recall, F1-Score.
    Input - y_Pred: encoded predicted values of Test data (list, 1-D array or column vector)
            y_Test: encoded true values of Test data
            y_Train: not used; kept so existing callers keep working
            classifier_fold_info: dictionary having classifier info; 'Accuracy' and
                'metrics' are added to it before it is dumped
    Output - None
    """
    class_names = ["Default","Fully Paid"]

    # Flatten once so that lists and (n, 1) prediction arrays are handled alike.
    y_Test = np.ravel(y_Test)
    y_Pred = np.ravel(y_Pred)

    # decode labels
    y_Pred_Decoded, y_Test_Decoded = Inv_Enc(y_Pred, y_Test)

    # plot confusion matrix
    cnf_matrix = confusion_matrix(y_Test_Decoded, y_Pred_Decoded)
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
    plt.savefig(Tosave_Folder+"/"+classifier_fold_info['classifier']+"_"+str(classifier_fold_info['splitCount'])+"ConfusionMatx")
    plt.close()

    # calculate performance metric: share of matching labels, in percent
    correctTotal = int(np.sum(y_Test == y_Pred))
    acc = np.round((float(correctTotal)/int(len(y_Test)))*100,2)
    classifier_fold_info['Accuracy']=str(acc)
    classifier_fold_info["metrics"] = classification_report(y_Test, y_Pred,target_names=class_names)
    with open(Tosave_Folder+"/Output.json","a") as fw:
        json.dump(classifier_fold_info, fw,indent=2)
        # Separate consecutive records; without it the objects run together ("}{").
        fw.write("\n")
    print("\n\nAccuracy: ",acc)
    accuracies.append(acc)
    
    
    
def _save_history_plot(history_dict, train_key, val_key, title, y_label, legend_loc, out_path):
    """Plot one training curve and one validation curve over the epochs and save the figure."""
    plt.figure()
    plt.plot(history_dict[train_key])
    plt.plot(history_dict[val_key])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.ylim(ymin=0)
    plt.legend(['train', 'test'], loc=legend_loc)
    plt.savefig(out_path)
    plt.close()


def Create_NN(X_Train, X_TrainTest, y_Train, y_TrainTest, X_Test, y_Test, classifier_fold_info):
    """
    Build a one-hidden-layer Keras Sequential network, train it on the Training
    set while validating on the Validation set, save the accuracy and loss
    curves of the run, and predict the classes of the Test set.
    The training time is stored in classifier_fold_info["Time taken"].
    Input - Splitted Training set, Validation set, Test set, dictionary_storing_classifier_info
            (y_Test is accepted but not used here)
    Output- predicted classes for Test set.
    """

    #------------------------
    # Define neural network
    #-----------------------
    n_features = np.shape(X_Train)[1]
    model = Sequential()
    model.add(Dense(200,activation ='relu',input_dim=n_features))
    model.add(Dense(units=1, activation = "sigmoid"))
    model.compile(optimizer=Adam(lr=0.01), loss='binary_crossentropy', metrics=['accuracy'])

    # Stop when the validation loss stalls for 5 epochs; keep a checkpoint of
    # every epoch that improves the validation accuracy.
    stopper = EarlyStopping(monitor='val_loss',patience=5, verbose=1)
    checkpointer = ModelCheckpoint(Tosave_Folder+"/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5",
                                   monitor='val_acc', verbose=1, save_best_only=True,
                                   mode='auto',save_weights_only=False)

    print("\nTraining Data shape is:",np.shape(X_Train))

    started = datetime.now()
    history = model.fit(X_Train, y_Train, validation_data=(X_TrainTest,y_TrainTest), epochs=10,
                        callbacks=[stopper, checkpointer], batch_size = 32)
    classifier_fold_info["Time taken"] = str(datetime.now()-started)

    # -- Plot
    fold_tag = str(classifier_fold_info['splitCount'])
    # summarize history for accuracy
    _save_history_plot(history.history, 'acc', 'val_acc',
                       "NN_Fold"+fold_tag+"Training_Validation_Accuracy",
                       'accuracy', 'lower left',
                       Tosave_Folder+"/Fold"+fold_tag+"TrainAccuracy")
    # summarize history for loss
    _save_history_plot(history.history, 'loss', 'val_loss',
                       "NN_Fold"+fold_tag+"TrainLoss",
                       'loss', 'upper left',
                       Tosave_Folder+"/Fold"+fold_tag+"TrainLoss")

    # do predictions
    return model.predict_classes(X_Test)




    
def Neural_Network(X_OHE,y_labEnc):
    """
    This function evaluates the neural network with stratified 4-fold cross-validation.
    For every fold it rescales the features (Rescaling), splits the training part
    again into Training and Validation sets, calls Create_NN for model creation,
    training and prediction, and records the fold metrics with Performace_Metrics.
    Input - One-Hot-Encoded input sparse Matrix, Label Encoded Target Class
    Output - None; prints the average accuracy over the folds of this run.
    """

    #-----------------------------------------------------------------
    #           Neural Network
    #-----------------------------------------------------------------
    n_folds = 4

    classifier_fold_info ={}
    classifier_fold_info['splitCount']=1
    classifier_fold_info['classifier']="NeuralNetwork"

    # Stratified folds keep the class balance in every fold. No random_state is
    # passed: without shuffle=True it has no effect, and recent scikit-learn
    # versions raise a ValueError for that combination.
    kf = StratifiedKFold(n_splits=n_folds)

    y_labEnc = np.array(y_labEnc)
    # Remember where this run starts in the global accuracies list.
    first_result = len(accuracies)

    for train, test in kf.split(X_OHE,y_labEnc):
        X_Train, X_Test, y_Train, y_Test = X_OHE[train], X_OHE[test], y_labEnc[train], y_labEnc[test]

        X_Train, X_Test = Rescaling(X_Train, X_Test)
        X_Train, X_TrainTest, y_Train, y_TrainTest = train_test_split(X_Train,y_Train, test_size=0.33, random_state=seed)

        # --Model--
        y_Pred = Create_NN(X_Train, X_TrainTest, y_Train, y_TrainTest, X_Test, y_Test, classifier_fold_info)

        Performace_Metrics(y_Pred, y_Test, y_Train, classifier_fold_info)
        classifier_fold_info['splitCount']=classifier_fold_info['splitCount']+1

    # Result: average over the folds evaluated by this call only
    fold_accuracies = accuracies[first_result:]
    average_accuracy =float(np.sum(fold_accuracies) /len(fold_accuracies))
    print("\n\nNeural Network  4-fold cross validation", average_accuracy)

    
 


    
def Xgboost(X_labEnc,y_labEnc):
    """
    This function evaluates the Xgboost algorithm with stratified 4-fold cross-validation.
    For every fold the training part is split again into Training and Validation
    sets (the Validation set drives early stopping), the model is trained and
    the Test part is predicted. Fold metrics are recorded with Performace_Metrics.
    Input - Label Encoded Input array, Label Encoded Target Class
    Output - None; prints the average accuracy over the folds of this run.
    """
    n_folds = 4

    classifier_fold_info ={}
    classifier_fold_info['splitCount']=1
    classifier_fold_info['classifier']="Xgboost_Stratified"

    #------------------------------------------------
    #           XGBoost  - stratified sampling
    #------------------------------------------------
    # No random_state: without shuffle=True it has no effect, and recent
    # scikit-learn versions raise a ValueError for that combination.
    kf = StratifiedKFold(n_splits=n_folds)

    X = np.array(X_labEnc)
    y = np.array(y_labEnc)
    # Remember where this run starts in the global accuracies list.
    first_result = len(accuracies)

    for train, test in kf.split(X_labEnc,y_labEnc):

        # split train,test
        X_Train, X_Test, y_Train, y_Test = X[train], X[test], y[train], y[test]
        X_Train, X_TrainTest, y_Train, y_TrainTest = train_test_split(X_Train,y_Train, test_size=0.33, random_state=1)

        # define model
        clf = xgboost.sklearn.XGBClassifier(objective="binary:logistic",learning_rate=0.01,seed=1,
                                            max_depth=20, gamma=10, n_estimators=300)
        # fit model
        start_time= datetime.now()
        clf.fit(X_Train, y_Train, early_stopping_rounds=5,eval_set=[(X_TrainTest,y_TrainTest)], eval_metric="auc", verbose=True)
        classifier_fold_info["Time taken"] = str(datetime.now()-start_time)

        y_Pred = [round(value) for value in clf.predict(X_Test)]

        # Decode Labels in predictions
        Performace_Metrics(y_Pred, y_Test, y_Train, classifier_fold_info)
        classifier_fold_info['splitCount']=classifier_fold_info['splitCount']+1

    # result: average over the folds evaluated by this call only
    fold_accuracies = accuracies[first_result:]
    average_accuracy =float(np.sum(fold_accuracies) /len(fold_accuracies))
    print("\n\n XGBoost 4-fold cross validation", average_accuracy)
    
    
    
    
def Xgboost_Unstratified(X_labEnc,y_labEnc):
    """
    This function evaluates the Xgboost algorithm on 4 un-stratified random
    Train/Test splits (repeated hold-out, not disjoint folds). For every split
    the training part is split again into Training and Validation sets (the
    Validation set drives early stopping), the model is trained and the Test
    part is predicted. Metrics are recorded with Performace_Metrics.
    Input - Label Encoded Input array, Label Encoded Target Class
    Output - None; prints the average accuracy over the splits of this run.
    """
    n_splits = 4

    classifier_fold_info ={}
    classifier_fold_info['splitCount']=1
    classifier_fold_info['classifier']="Xgboost_Unstratified"

    # Remember where this run starts in the global accuracies list.
    first_result = len(accuracies)

    #------------------------------------------------
    #           XGBoost  -  non-stratified sampling
    #------------------------------------------------
    for splits in range(n_splits):

        # A different but fixed random_state per repetition: the splits differ
        # from each other, yet re-running the cell reproduces the same results.
        X_Train, X_Test, y_Train, y_Test = train_test_split(X_labEnc,y_labEnc,test_size=0.33,
                                                            random_state=seed + splits)
        X_Train, X_TrainTest, y_Train, y_TrainTest = train_test_split(X_Train,y_Train, test_size=0.33,
                                                                      random_state=seed)

        # define model
        clf = xgboost.sklearn.XGBClassifier(objective="binary:logistic",learning_rate=0.01,seed=seed,
                                            max_depth=20, gamma=10, n_estimators=300)
        # fit model
        start_time= datetime.now()
        clf.fit(X_Train, y_Train, early_stopping_rounds=5,eval_set=[(X_TrainTest,y_TrainTest)], eval_metric="auc",
                                                                    verbose=True)
        classifier_fold_info["Time taken"] = str(datetime.now()-start_time)

        y_Pred = [round(value) for value in clf.predict(X_Test)]

        # Decode Labels in predictions
        Performace_Metrics(y_Pred, y_Test, y_Train, classifier_fold_info)
        classifier_fold_info['splitCount']=classifier_fold_info['splitCount']+1

    # result: average over the splits evaluated by this call only
    print(accuracies)
    split_accuracies = accuracies[first_result:]
    average_accuracy =(np.sum(split_accuracies)/len(split_accuracies))
    print("\n\nXGBoost unstratified 4-fold cross validation", average_accuracy)







def Logistic_Regression(X_OHE,y_labEnc):
    """
    This function evaluates Logistic Regression with stratified 4-fold cross-validation.
    For every fold it rescales the features (Rescaling), splits the training part
    again into Training and Validation sets (only the Training set is used for
    fitting), trains the model and predicts the Test part. Fold metrics are
    recorded with Performace_Metrics.
    Input - One-Hot-Encoded Input array, Label Encoded Target Class
    Output - None; prints the average accuracy over the folds of this run.
    """
    n_folds = 4

    classifier_fold_info ={}
    classifier_fold_info['splitCount']=1
    classifier_fold_info['classifier']="Logistic Regression"
    logreg = LogisticRegression(penalty='l2', tol=0.0001, random_state=seed,solver='liblinear', max_iter=50,verbose=1)
                                # liblinear is a good choice for small datasets

    # No random_state: without shuffle=True it has no effect, and recent
    # scikit-learn versions raise a ValueError for that combination.
    kf = StratifiedKFold(n_splits=n_folds)

    y_labEnc = np.array(y_labEnc)
    # Remember where this run starts in the global accuracies list.
    first_result = len(accuracies)

    for train, test in kf.split(X_OHE,y_labEnc):
        X_Train, X_Test, y_Train, y_Test = X_OHE[train], X_OHE[test], y_labEnc[train], y_labEnc[test]

        X_Train, X_Test = Rescaling(X_Train, X_Test)
        X_Train, X_TrainTest, y_Train, y_TrainTest = train_test_split(X_Train,y_Train, test_size=0.33, random_state=seed)

        start_time= datetime.now()
        logreg.fit(X_Train, y_Train)
        classifier_fold_info["Time taken"] = str(datetime.now()-start_time)

        y_Pred = logreg.predict(X_Test)

        # Decode Labels in predictions
        Performace_Metrics(y_Pred, y_Test, y_Train, classifier_fold_info)
        classifier_fold_info['splitCount'] =  classifier_fold_info['splitCount']+1

    # result: average over the folds evaluated by this call only. Summing the
    # whole global list is wrong as soon as another classifier has run before.
    print(accuracies)
    fold_accuracies = accuracies[first_result:]
    average_accuracy =(np.sum(fold_accuracies)/len(fold_accuracies))
    print("\n\nLogistic Regression 4-fold cross validation", average_accuracy)

In [10]:
# Load the three input CSV files and keep only the selected columns.
data_subset =Read_Data()

Dataframe with filtered columns - shape is:  (887379, 18)


In [12]:
# Balance the classes: all 'Default' rows plus a random sample of 'Fully Paid' rows.
under_sample_data, whole_data = Under_Sampling(data_subset)

       funded_amnt  funded_amnt_inv  int_rate  loan_amnt loan_status  \
318          18000     17975.000000     17.27      18000     Default   
7620         21250     21003.604048     14.27      21250     Default   
11759         5600      5600.000000     15.99       5600     Default   
13439        15975     15975.000000     20.99      15975     Default   
13856         5000      5000.000000     15.99       5000     Default   

      pymnt_plan        term sub_grade  annual_inc emp_length  open_acc  \
318            n   60 months        D3     62000.0     1 year       8.0   
7620           n   60 months        C2     36000.0    5 years      20.0   
11759          n   60 months        D2     52416.0    8 years       8.0   
13439          n   60 months        E5    225000.0    8 years      10.0   
13856          n   60 months        D2     65004.0  10+ years      15.0   

       pub_rec  total_acc  last_pymnt_amnt  revol_bal  total_pymnt  \
318        0.0       30.0           449.97    

In [13]:
# Replace the numeric columns by string buckets and clean up emp_length.
dataframe = Data_Preprocessing(under_sample_data)

Cateogrizing interest rate ...
Cateogrizing annual_income ...
['funded_amnt', 'funded_amnt_inv', 'loan_amnt', 'loan_status', 'pymnt_plan', 'term', 'sub_grade', 'emp_length', 'open_acc', 'pub_rec', 'total_acc', 'last_pymnt_amnt', 'revol_bal', 'total_pymnt', 'total_pymnt_inv', 'total_rec_late_fee', 'interest_rate', 'annual_income']
       funded_amnt  funded_amnt_inv  loan_amnt loan_status pymnt_plan  \
318          18000     17975.000000      18000     Default          n   
7620         21250     21003.604048      21250     Default          n   
11759         5600      5600.000000       5600     Default          n   
13439        15975     15975.000000      15975     Default          n   
13856         5000      5000.000000       5000     Default          n   

             term sub_grade emp_length  open_acc  pub_rec  total_acc  \
318     60 months        D3     1 year       8.0      0.0       30.0   
7620    60 months        C2    5 years      20.0      0.0       25.0   
11759   60 mo

In [14]:
# Round the payment/amount columns to whole numbers.
dataset = RoundOff_Data(dataframe)



Rounding Off...

(2438, 18)


In [15]:
# Label encode every column; X_OHE is the one-hot matrix, X_labEnc the label encoded features.
X_OHE, y_labEnc, X_labEnc = Encoding(dataset)

Label Encoding ... 

One Hot Encoded Data (2438, 7514)


In [16]:
# Evaluate logistic regression on the one-hot encoded features (stratified 4 folds).
Logistic_Regression(X_OHE,y_labEnc)

[LibLinear][[197 108]
 [107 198]]


Accuracy:  64.75
[LibLinear][[218  87]
 [109 196]]


Accuracy:  67.87
[LibLinear][[191 114]
 [107 198]]


Accuracy:  63.77
[LibLinear][[210  94]
 [103 201]]


Accuracy:  67.6
[64.75, 67.870000000000005, 63.770000000000003, 67.599999999999994]


Logistic Regression 4-fold cross validation 65.9975


In [17]:
# Evaluate the Keras neural network on the one-hot encoded features (stratified 4 folds).
Neural_Network(X_OHE,y_labEnc)


Training Data shape is: (1224, 7514)
Train on 1224 samples, validate on 604 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.65563, saving model to Credit_Defaulter/weights-improvement-01-0.66.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.65563 to 0.68874, saving model to Credit_Defaulter/weights-improvement-02-0.69.hdf5
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.68874
Epoch 4/10

Epoch 00004: val_acc improved from 0.68874 to 0.70199, saving model to Credit_Defaulter/weights-improvement-04-0.70.hdf5
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.70199
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.70199
Epoch 00006: early stopping
[[197 108]
 [121 184]]


Accuracy:  62.46

Training Data shape is: (1224, 7514)
Train on 1224 samples, validate on 604 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.65066, saving model to Credit_Defaulter/weights-improvement-01-0.65.hdf5
Epoch 2/10

Epoch 00002: val_acc did not improve

In [18]:
# Evaluate XGBoost on the label encoded features (stratified 4 folds).
Xgboost(X_labEnc,y_labEnc)

[0]	validation_0-auc:0.891417
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.91194
[2]	validation_0-auc:0.911173
[3]	validation_0-auc:0.911929
[4]	validation_0-auc:0.911885
[5]	validation_0-auc:0.912247
[6]	validation_0-auc:0.911874
[7]	validation_0-auc:0.912434
[8]	validation_0-auc:0.912488
[9]	validation_0-auc:0.913103
[10]	validation_0-auc:0.919233
[11]	validation_0-auc:0.918695
[12]	validation_0-auc:0.918202
[13]	validation_0-auc:0.918279
[14]	validation_0-auc:0.919748
[15]	validation_0-auc:0.919562
[16]	validation_0-auc:0.919721
[17]	validation_0-auc:0.920357
[18]	validation_0-auc:0.920872
[19]	validation_0-auc:0.91994
[20]	validation_0-auc:0.920905
[21]	validation_0-auc:0.920488
[22]	validation_0-auc:0.922512
[23]	validation_0-auc:0.922062
[24]	validation_0-auc:0.922062
[25]	validation_0-auc:0.92238
[26]	validation_0-auc:0.923077
[27]	validation_0-auc:0.922846
[28]	validation_0-auc:0.922956
[29]	validation_0-auc:0.923449
[30]	validation_0-au

In [19]:
# Evaluate XGBoost on the label encoded features using 4 un-stratified random splits.
Xgboost_Unstratified(X_labEnc,y_labEnc)

[0]	validation_0-auc:0.915797
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.915797
[2]	validation_0-auc:0.915797
[3]	validation_0-auc:0.915797
[4]	validation_0-auc:0.915797
[5]	validation_0-auc:0.929955
[6]	validation_0-auc:0.930891
[7]	validation_0-auc:0.930919
[8]	validation_0-auc:0.935098
[9]	validation_0-auc:0.937316
[10]	validation_0-auc:0.938087
[11]	validation_0-auc:0.937825
[12]	validation_0-auc:0.937502
[13]	validation_0-auc:0.938865
[14]	validation_0-auc:0.939375
[15]	validation_0-auc:0.937901
[16]	validation_0-auc:0.937942
[17]	validation_0-auc:0.936441
[18]	validation_0-auc:0.938004
[19]	validation_0-auc:0.936731
Stopping. Best iteration:
[14]	validation_0-auc:0.939375

[[317  80]
 [ 39 369]]


Accuracy:  85.22
[0]	validation_0-auc:0.916281
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.92333
[2]	validation_0-auc:0.92333
[3]	validation_0-auc:0.9263
[4]	validation_0-auc:0.926218
[5]	validation_0-a