In [None]:
#Importing packages that may be used later in the program
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
import xgboost
import math
import lightgbm
import pandasql as psql
warnings.filterwarnings("ignore")

In [None]:
#Load the raw training CSV; CD_BK1 keeps an untouched snapshot while CD is the working frame
CD_BK1=pd.read_csv("../input/dataset4/credit_train.csv",header=0)
CD=CD_BK1.copy()
CD.head()

In [None]:
#Renaming the listed raw headers so that every space becomes an underscore
#(only the names listed here are touched; any other column keeps its name)
spaced_columns=[
    'Loan ID',
    'Customer ID',
    'Loan Status',
    'Current Loan Amount',
    'Credit Score',
    'Annual Income',
    'Years in current job',
    'Home Ownership',
    'Monthly Debt',
    'Years of Credit History',
    'Months since last delinquent',
    'Number of Open Accounts',
    'Number of Credit Problems',
    'Current Credit Balance',
    'Maximum Open Credit',
    'Tax Liens',
]
CD=CD.rename(columns={raw_name:raw_name.replace(' ','_') for raw_name in spaced_columns})

In [None]:
#Checking the shape of the dataset
print(CD.shape)

In [None]:
print('HEATMAP REPRESENTING THE CORRELATION AMONG DIFFERENT COLUMNS BASED ON THIER CORRELATION COEFFIECIENT VALUE')
#A correlation coefficient only exists for numeric columns. CD still holds text columns
#here, and recent pandas versions raise instead of silently skipping them, so the
#numeric columns are selected explicitly before calling corr().
sn.heatmap(CD.select_dtypes(include='number').corr(),cmap='Greys')

In [None]:
print('\nPercentage of null values in each of the columns(drop column if >80%) along with their BAR PLOT ')
#Share of missing values per column, computed once for the whole frame
null_pct=(CD.isnull().sum()/len(CD))*100
l=[]   #percentage of nulls, in column order
lx=[]  #positional tick labels C0, C1, ...
for position,pct in enumerate(null_pct):
    print('Column ',position,' : ',pct,' %')
    l.append(pct)
    lx.append('C'+str(position))
plt.figure()
plt.bar(lx,l)
plt.show()

In [None]:
#Number of rows with Null values in each column
CD.isnull().sum(axis=0)

In [None]:
#How many rows have 0, 1, 2, ... missing fields: printed as a table and drawn as a bar plot
nulls_per_row=CD.isnull().sum(axis=1)
temp=nulls_per_row.value_counts()
print(temp)
#l1: number of missing fields in a row, l2: number of rows with that many missing fields
l1,l2=list(temp.index),list(temp.values)
plt.figure()
plt.bar(l1,l2)
plt.show()

In [None]:
#The two identifier columns are removed before modelling
CD=CD.drop(columns=['Loan_ID','Customer_ID'])

In [None]:
CD

In [None]:
#Remove the rows in which every remaining column is null (514 rows according to the original note)
CD=CD.dropna(axis=0,how='all')

In [None]:
#Reseting the index values after dropping 514 rows
CD=CD.reset_index(drop=True)
#making a backup for the new dataset
CD_BK2=CD.copy()

In [None]:
#Re-check the missing-fields-per-row distribution after the all-null rows were dropped
nulls_per_row=CD.isnull().sum(axis=1)
temp=nulls_per_row.value_counts()
print(temp)
#l1: number of missing fields in a row, l2: number of rows with that many missing fields
l1,l2=list(temp.index),list(temp.values)
plt.figure()
plt.bar(l1,l2)
plt.show()

In [None]:
#Creating Simple and KNN Imputer objects to fill the rows with null values
from sklearn.impute import SimpleImputer,KNNImputer
sio=SimpleImputer(strategy='most_frequent')
knnio=KNNImputer(n_neighbors=10)

In [None]:
#Checking the columns which have Null values
CD.isnull().any()

In [None]:
#Fill the missing Years_in_current_job entries with the most frequent value.
#fit_transform returns an (n,1) array, which is flattened before being stored as the column.
filled_years=sio.fit_transform(CD[['Years_in_current_job']])
CD['Years_in_current_job']=filled_years.ravel()

In [None]:
#Filling the rows with Null values in columns which have them using KNNImputer
#Each column is imputed on its own: the imputer is re-fitted on a one-column frame per call.
#NOTE(review): with only one feature passed in, a row whose value is missing has no other
#feature to measure neighbour distance on; per the scikit-learn KNNImputer docs the
#training-set average is used in that situation, so this presumably behaves like mean
#imputation -- confirm whether joint imputation over several columns was intended.
CD[['Credit_Score']]=knnio.fit_transform(CD[['Credit_Score']])
CD[['Annual_Income']]=knnio.fit_transform(CD[['Annual_Income']])

In [None]:
CD[['Months_since_last_delinquent']]=knnio.fit_transform(CD[['Months_since_last_delinquent']])

In [None]:
#Same single-column KNN imputation as above, applied to the remaining columns with nulls
for column in ('Maximum_Open_Credit','Bankruptcies','Tax_Liens'):
    CD[[column]]=knnio.fit_transform(CD[[column]])

In [None]:
CD.info()

In [None]:
#Creating a backup of the updated dataset
CD_BK3=CD.copy()

In [None]:
#Checking if the updated dataset has any Null values
CD.isnull().any()

In [None]:
#Exact duplicate rows are removed (the last occurrence is kept) and the index is renumbered
CD=CD.drop_duplicates(keep='last').reset_index(drop=True)
#Backup of the de-duplicated frame
CD_BK4=CD.copy()

In [None]:
CD.shape

In [None]:
#Encode the target column: 'Fully Paid' becomes 1 and 'Charged Off' becomes 0, stored as int
status_codes=(('Fully Paid','1'),('Charged Off','0'))
for label,code in status_codes:
    CD.Loan_Status=CD.Loan_Status.str.replace(label,code)
CD.Loan_Status=CD.Loan_Status.astype(int)

In [None]:
CD.columns[CD.dtypes=='object']

In [None]:
print('STACKED BAR PLOTS REPRESENTING THE PERCENTAGE OF PEOPLE IN EACH GROUP OF EVERY COLUMN PAYING THE LOAN AND BEING CHARGED OFF RESPECTIVELY')
#For every categorical column: print, per category, the number of loans and the percentage
#that were fully paid (Loan_Status == 1), then draw a 100% stacked bar per category.
lis=[ 'Term', 'Years_in_current_job', 'Home_Ownership','Purpose']
paid_loans=CD[CD.Loan_Status==1]
w=0.5   #bar width
l3=[]   #per column: the category labels, most frequent first
l4=[]   #per column: the paid percentage of each category, same order as l3
for col in lis:
    print(col)
    t1=CD[col].value_counts()                   #loans per category
    paid_counts=paid_loans[col].value_counts()  #fully paid loans per category
    lx=list(t1.index)
    l2=[]    #paid percentage per category
    nl2=[]   #charged-off percentage per category
    for j in lx:
        #.get(j,0): a category without a single paid loan counts as 0% instead of failing
        paid_pct=(paid_counts.get(j,0)/t1.loc[j])*100
        print(j,' ',t1.loc[j],' ',paid_pct,' %')
        l2.append(paid_pct)
        nl2.append(100-paid_pct)
    l3.append(lx)
    l4.append(l2)
    plt.figure()
    plt.bar(lx,l2,w,label='Paid')
    #bottom=l2 places the second series on top of the first so the bars are really stacked
    plt.bar(lx,nl2,w,bottom=l2,label='Not Paid')
    plt.legend(loc='upper right')
    plt.show()

In [None]:
#The remaining text columns are turned into integers by hand-written replacement tables.
#Term: 'Short Term' becomes 1 and 'Long Term' becomes 0
term_codes=(('Short Term','1'),('Long Term','0'))
for label,code in term_codes:
    CD.Term=CD.Term.str.replace(label,code)
CD.Term=CD.Term.astype(int)

In [None]:
#First batch of Years_in_current_job codes. The order matters: '< 1 year' has to be
#replaced before '1 year', otherwise the shorter text would match inside the longer one.
years_codes_first=(
    ('9 years','0'),
    ('< 1 year','1'),
    ('7 years','2'),
    ('6 years','3'),
    ('1 year','4'),
    ('8 years','5'),
)
for label,code in years_codes_first:
    CD.Years_in_current_job=CD.Years_in_current_job.str.replace(label,code)

In [None]:
#Remove the literal '+' (as in '10+ years'). '+' is a regular-expression metacharacter and
#the default of the regex flag differs between pandas versions, so it is set explicitly.
CD.Years_in_current_job=CD.Years_in_current_job.str.replace('+','',regex=False)

In [None]:
CD.Years_in_current_job=CD.Years_in_current_job.str.replace('10 years','6')

In [None]:
#Second batch of Years_in_current_job codes, then the column is converted to int
years_codes_second=(
    ('4 years','7'),
    ('5 years','8'),
    ('2 years','9'),
    ('3 years','10'),
)
for label,code in years_codes_second:
    CD.Years_in_current_job=CD.Years_in_current_job.str.replace(label,code)
CD.Years_in_current_job=CD.Years_in_current_job.astype(int)

In [None]:
#Home_Ownership codes, applied in this order, then the column is converted to int
home_codes=(
    ('Rent','0'),
    ('Own Home','1'),
    ('Home Mortgage','2'),
    ('HaveMortgage','3'),
)
for label,code in home_codes:
    CD.Home_Ownership=CD.Home_Ownership.str.replace(label,code)
CD.Home_Ownership=CD.Home_Ownership.astype(int)

In [None]:
#Purpose codes, applied in this order. 'other' and 'Other' are distinct labels with
#distinct codes; the column is converted to int once every label has been replaced.
purpose_codes=(
    ('renewable_energy','0'),
    ('small_business','1'),
    ('Business Loan','2'),
    ('vacation','3'),
    ('moving','4'),
    ('Medical Bills','5'),
    ('other','6'),
    ('Debt Consolidation','7'),
    ('major_purchase','8'),
    ('Buy House','9'),
    ('Take a Trip','10'),
    ('Home Improvements','11'),
    ('wedding','12'),
    ('Other','13'),
    ('Educational Expenses','14'),
    ('Buy a Car','15'),
)
for label,code in purpose_codes:
    CD.Purpose=CD.Purpose.str.replace(label,code)
CD.Purpose=CD.Purpose.astype(int)

In [None]:
CD.info()

In [None]:
#Checking the unique values in each of the columns and their number of occurances
for col in CD.columns:
    print(col)
    print(CD[col].value_counts())

In [None]:
CD.columns

In [None]:
lis=[ 'Current_Loan_Amount',  'Credit_Score',
       'Annual_Income', 'Years_in_current_job', 'Home_Ownership', 'Purpose',
       'Monthly_Debt', 'Years_of_Credit_History',
       'Months_since_last_delinquent', 'Number_of_Open_Accounts',
       'Number_of_Credit_Problems', 'Current_Credit_Balance',
       'Maximum_Open_Credit', 'Bankruptcies', 'Tax_Liens']

In [None]:
print('HISTOGRAMS REPRESENTING THE DISTRIBUTION OF DATA IN ALL THE COLUMNS OF THE TRAINING DATASET')
#For each listed column: print its mean and standard deviation, then draw a 100-bin histogram
for col in lis:
    column_values=CD[col]
    print('Name : ',col)
    print('Mean : ',column_values.mean())
    print('Stadard Deviation : ',column_values.std())
    plt.figure()
    plt.hist(column_values,bins=100)
    plt.show()

In [None]:
#Removing outliers column by column: only rows whose z-score lies strictly between
#-3.5 and 3.5 are kept. Mean and standard deviation are recomputed on the rows that
#survived the previous columns, so the order of the columns in lis matters.
print('Initial',CD.shape,sep=':')
for col in lis:
    print(col)
    spread=CD[col].std()
    #A zero (constant column) or NaN spread would turn every z-score into NaN and the
    #filter below would then drop every row, so such a column is left unfiltered.
    if spread>0:
        zscore=(CD[col]-CD[col].mean())/spread
        CD=CD[(zscore>-3.5)&(zscore<3.5)]
    print(col,CD.shape,sep=':')

In [None]:
#x: every column except the target; y: the target kept as a one-column DataFrame
is_target=CD.columns=='Loan_Status'
x=CD[CD.columns[~is_target]]
y=CD[CD.columns[is_target]]

In [None]:
print('PIE CHART REPRESENTING THE PERCENTAGE OF DIFFERENT OUTCOMES OF THE TARGET VARIABLE IN THE TRAINING DATA')
lis=y.value_counts()
plt.figure()
#The slice labels are read from the counted index, so each slice always carries the class
#it really belongs to, whichever class happens to be the more frequent one.
plt.pie(lis,startangle=90,labels=[str(label) for label in lis.index.get_level_values(0)])
plt.show()

In [None]:
y.value_counts()

In [None]:
#Randomly oversampling the minority class; sampling_strategy=0.6 is the minority/majority
#ratio requested after resampling. random_state is fixed so that a re-run of the notebook
#duplicates the same rows and produces the same results.
from imblearn.over_sampling import RandomOverSampler
roso=RandomOverSampler(sampling_strategy=0.6,random_state=45)
x_over,y_over=roso.fit_resample(x,y)

In [None]:
print('PIE CHART REPRESENTING THE PERCENTAGE OF DIFFERENT OUTCOMES OF THE TARGET VARIABLE IN THE UPDATED TRAINING DATA')
lis=y_over.value_counts()
plt.figure()
#The slice labels are read from the counted index, so each slice always carries the class
#it really belongs to, whichever class happens to be the more frequent one.
plt.pie(lis,startangle=90,labels=[str(label) for label in lis.index.get_level_values(0)])
plt.show()

In [None]:
#Splitting the original (not oversampled) data into training and testing parts.
#Random oversampling duplicates rows, so splitting after oversampling would put copies of
#the same row on both sides and inflate every test metric. The split therefore happens
#first and only the training part is oversampled; the test part keeps the real class balance.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=45,stratify=y)
x_train,y_train=roso.fit_resample(x_train,y_train)

In [None]:
#making a backup copy for the dataset
CD_BK5=CD.copy()

In [None]:
#Scaling the feature columns of the training and testing samples to the range 0..1.
#The scaler learns min/max from the training data only and the same mapping is applied to
#the test data; re-fitting it on the test data would scale the two sets differently.
#Test values outside the training range can therefore fall slightly outside 0..1.
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler(feature_range=(0,1))
x_train=pd.DataFrame(mms.fit_transform(x_train))
x_test=pd.DataFrame(mms.transform(x_test))

In [None]:
#Normalizing the sample of the dependent variables' data
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler(feature_range=(0,1))
x_over=mms.fit_transform(x_over)
x_over=pd.DataFrame(x_over)

In [None]:
#Reading the CSV file with the following values and importing them as headers and filling them with metrics which represent the performance of difference models while testing it with testing data
Res1=pd.read_csv("../input/results/Results.csv",header=0)
Res1.head()

In [None]:
#Fit each candidate classifier on the training data, evaluate it on the test data, print
#and plot the results, and add one row of metrics per model to the Res1 dataframe.
#Class 1 ('Fully Paid') is the positive class for every metric below.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
lro=LogisticRegression()
dto=DecisionTreeClassifier()
rfo=RandomForestClassifier()
knno=KNeighborsClassifier()
eto=ExtraTreesClassifier()
bco=BaggingClassifier()
gnbo=GaussianNB()
xgbo=XGBClassifier()
gbo=GradientBoostingClassifier()
lgbmo=LGBMClassifier()
oli=[lro,dto,rfo,knno,eto,bco,gnbo,xgbo,gbo,lgbmo]

def _ratio(num,den):
    """Return num/den, or NaN when the denominator is zero (metric undefined)."""
    return num/den if den else float('nan')

#The estimators expect a 1-D target; the targets are one-column frames, so flatten them once
y_train_1d=np.ravel(y_train)
y_over_1d=np.ravel(y_over)
ac=np.ravel(y_test)   #actual test labels
result_rows=[]        #one metrics dict per model, added to Res1 after the loop
for obj in oli:
    obj.fit(x_train,y_train_1d)
    y_pred=obj.predict(x_test)
    pr=y_pred
    #With labels=[0,1] the matrix is [[TN,FP],[FN,TP]] for positive class 1, so the
    #flattened order is tn,fp,fn,tp. Plain ints keep the products below from overflowing.
    mat=confusion_matrix(ac,pr,labels=[0,1],sample_weight=None,normalize=None)
    tn,fp,fn,tp=(int(count) for count in mat.ravel())
    cr=classification_report(ac,pr)
    print('----------------------------------------------------------------------------------------')
    print('Model Name : ',obj)
    print('True Positive : ',tp)
    print('False Negative : ',fn)
    print('False Positive : ',fp)
    print('True Negative : ',tn)
    cvs=np.mean(cross_val_score(estimator=obj,X=x_over,y=y_over_1d,cv=5))
    p=_ratio(tp,tp+fp)                 #precision
    npv=_ratio(tn,tn+fn)               #negative predictive value
    s=_ratio(tp,tp+fn)                 #sensitivity (recall)
    sp=_ratio(tn,tn+fp)                #specificity
    acc=_ratio(tp+tn,tp+fn+fp+tn)
    bacc=(s+sp)/2
    f1=_ratio(2*p*s,p+s)
    tmcc=math.sqrt((tp+fp)*(tn+fn)*(tp+fn)*(tn+fp))
    mcc=_ratio((tp*tn)-(fp*fn),tmcc)   #Matthews correlation coefficient
    print("Precision : ",p)
    print("Negative Predictive Value : ",npv)
    print("Specificity : ",sp)
    print("Sensitivity : ",s)
    print("Balanced Accuracy : ",bacc)
    print("Accuracy : ",acc)
    print("F1 Score : ",f1)
    print("MCC : ",mcc)
    #The area under the ROC curve is computed from the predicted probability of class 1,
    #the same scores the plotted curve uses; hard 0/1 predictions would give only one
    #operating point.
    proba=obj.predict_proba(x_test)[:,1]
    ras=roc_auc_score(ac,proba)
    print('ROC AUC Score : ',ras)
    print('Cross Validation Score : ',cvs)
    fpr,tpr,th=roc_curve(ac,proba)
    plt.figure()
    plt.plot(fpr,tpr,label='Classification model (%f)'%ras)
    plt.plot([0,1],[0,1],'r--',label='Half Line')
    plt.xlim([0,1])
    plt.ylim([0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operation Characterestic Curve')
    plt.legend(loc='lower right')
    plt.show()
    result_rows.append({
        'Model_Name':str(obj),   #same text that is printed above and written to the CSV
        'True_Positive':round(tp,4),
        'False_Negative':round(fn,4),
        'False_Positive':round(fp,4),
        'True_Negative':round(tn,4),
        'Precision':round(p,4),
        'Sensitivity':round(s,4),
        'Specificity':round(sp,4),
        'Negative_Predictive_Value':round(npv,4),
        'Accuracy':round(acc,4),
        'Balanced_Accuracy':round(bacc,4),
        'F1_Score':round(f1,4),
        'MCC':round(mcc,4),
        'ROC_Score':round(ras,4),
        'Cross_Val_Score':round(cvs,4)
    })
#DataFrame.append no longer exists in current pandas; all rows are added in one concat
Res1=pd.concat([Res1,pd.DataFrame(result_rows)],ignore_index=True)

In [None]:
#Checking the metrics of the different models and selecting the one with the best ones
Res1.head(10)

In [None]:
#x: every column except the target; y: the target kept as a one-column DataFrame
is_target=CD.columns=='Loan_Status'
x=CD[CD.columns[~is_target]]
y=CD[CD.columns[is_target]]

In [None]:
#Randomly oversampling the minority class; sampling_strategy=0.6 is the minority/majority
#ratio requested after resampling. random_state is fixed so that a re-run of the notebook
#duplicates the same rows and produces the same results.
from imblearn.over_sampling import RandomOverSampler
roso=RandomOverSampler(sampling_strategy=0.6,random_state=46)
x_over,y_over=roso.fit_resample(x,y)

In [None]:
#Splitting the original (not oversampled) data into training and testing parts, this time
#without stratification. Random oversampling duplicates rows, so splitting after
#oversampling would put copies of the same row on both sides and inflate every test metric.
#The split therefore happens first and only the training part is oversampled.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=46)
x_train,y_train=roso.fit_resample(x_train,y_train)

In [None]:
#Scaling the feature columns of the training and testing samples to the range 0..1.
#The scaler learns min/max from the training data only and the same mapping is applied to
#the test data; re-fitting it on the test data would scale the two sets differently.
#Test values outside the training range can therefore fall slightly outside 0..1.
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler(feature_range=(0,1))
x_train=pd.DataFrame(mms.fit_transform(x_train))
x_test=pd.DataFrame(mms.transform(x_test))

In [None]:
#Normalizing the sample of the dependent variables' data
from sklearn.preprocessing import MinMaxScaler
mms=MinMaxScaler(feature_range=(0,1))
x_over=mms.fit_transform(x_over)
x_over=pd.DataFrame(x_over)

In [None]:
#Reading the CSV file with the following values and importing them as headers and filling them with metrics which represent the performance of difference models while testing it with testing data
Res2=pd.read_csv("../input/results/Results.csv",header=0)
Res2.head()

In [None]:
#Fit each candidate classifier on the training data, evaluate it on the test data, print
#and plot the results, and add one row of metrics per model to the Res2 dataframe.
#Class 1 ('Fully Paid') is the positive class for every metric below.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
lro=LogisticRegression()
dto=DecisionTreeClassifier()
rfo=RandomForestClassifier()
knno=KNeighborsClassifier()
eto=ExtraTreesClassifier()
bco=BaggingClassifier()
gnbo=GaussianNB()
xgbo=XGBClassifier()
gbo=GradientBoostingClassifier()
lgbmo=LGBMClassifier()
oli=[lro,dto,rfo,knno,eto,bco,gnbo,xgbo,gbo,lgbmo]

def _ratio(num,den):
    """Return num/den, or NaN when the denominator is zero (metric undefined)."""
    return num/den if den else float('nan')

#The estimators expect a 1-D target; the targets are one-column frames, so flatten them once
y_train_1d=np.ravel(y_train)
y_over_1d=np.ravel(y_over)
ac=np.ravel(y_test)   #actual test labels
result_rows=[]        #one metrics dict per model, added to Res2 after the loop
for obj in oli:
    obj.fit(x_train,y_train_1d)
    y_pred=obj.predict(x_test)
    pr=y_pred
    #With labels=[0,1] the matrix is [[TN,FP],[FN,TP]] for positive class 1, so the
    #flattened order is tn,fp,fn,tp. Plain ints keep the products below from overflowing.
    mat=confusion_matrix(ac,pr,labels=[0,1],sample_weight=None,normalize=None)
    tn,fp,fn,tp=(int(count) for count in mat.ravel())
    cr=classification_report(ac,pr)
    print('----------------------------------------------------------------------------------------')
    print('Model Name : ',obj)
    print('True Positive : ',tp)
    print('False Negative : ',fn)
    print('False Positive : ',fp)
    print('True Negative : ',tn)
    cvs=np.mean(cross_val_score(estimator=obj,X=x_over,y=y_over_1d,cv=5))
    p=_ratio(tp,tp+fp)                 #precision
    npv=_ratio(tn,tn+fn)               #negative predictive value
    s=_ratio(tp,tp+fn)                 #sensitivity (recall)
    sp=_ratio(tn,tn+fp)                #specificity
    acc=_ratio(tp+tn,tp+fn+fp+tn)
    bacc=(s+sp)/2
    f1=_ratio(2*p*s,p+s)
    tmcc=math.sqrt((tp+fp)*(tn+fn)*(tp+fn)*(tn+fp))
    mcc=_ratio((tp*tn)-(fp*fn),tmcc)   #Matthews correlation coefficient
    print("Precision : ",p)
    print("Negative Predictive Value : ",npv)
    print("Specificity : ",sp)
    print("Sensitivity : ",s)
    print("Balanced Accuracy : ",bacc)
    print("Accuracy : ",acc)
    print("F1 Score : ",f1)
    print("MCC : ",mcc)
    #The area under the ROC curve is computed from the predicted probability of class 1,
    #the same scores the plotted curve uses; hard 0/1 predictions would give only one
    #operating point.
    proba=obj.predict_proba(x_test)[:,1]
    ras=roc_auc_score(ac,proba)
    print('ROC AUC Score : ',ras)
    print('Cross Validation Score : ',cvs)
    fpr,tpr,th=roc_curve(ac,proba)
    plt.figure()
    plt.plot(fpr,tpr,label='Classification model (%f)'%ras)
    plt.plot([0,1],[0,1],'r--',label='Half Line')
    plt.xlim([0,1])
    plt.ylim([0,1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operation Characterestic Curve')
    plt.legend(loc='lower right')
    plt.show()
    result_rows.append({
        'Model_Name':str(obj),   #same text that is printed above and written to the CSV
        'True_Positive':round(tp,4),
        'False_Negative':round(fn,4),
        'False_Positive':round(fp,4),
        'True_Negative':round(tn,4),
        'Precision':round(p,4),
        'Sensitivity':round(s,4),
        'Specificity':round(sp,4),
        'Negative_Predictive_Value':round(npv,4),
        'Accuracy':round(acc,4),
        'Balanced_Accuracy':round(bacc,4),
        'F1_Score':round(f1,4),
        'MCC':round(mcc,4),
        'ROC_Score':round(ras,4),
        'Cross_Val_Score':round(cvs,4)
    })
#DataFrame.append no longer exists in current pandas; all rows are added in one concat
Res2=pd.concat([Res2,pd.DataFrame(result_rows)],ignore_index=True)

In [None]:
#Saving the Results dataframe into a CSV file and compare our results later
Res1.to_csv('Test_Results1.csv',header=True,index=False)

In [None]:
#Saving the Results dataframe into a CSV file and compare our results later
Res2.to_csv('Test_Results2.csv',header=True,index=False)

In [None]:
#Tuning the hyper parameters of the model selected above (ExtraTrees) with an exhaustive
#5-fold grid search over the values listed below.
from sklearn.model_selection import GridSearchCV
teto=ExtraTreesClassifier()
#Named param_grid rather than dict, so the builtin dict stays usable in later cells
param_grid={
    'n_estimators':np.arange(70,130,10),
    'criterion':["gini", "entropy", "log_loss"],
    'class_weight' : ["balanced", "balanced_subsample"],
    'bootstrap':[True,False],
    'max_features':["sqrt", "log2", None]
}
gso=GridSearchCV(teto,param_grid,cv=5)

In [None]:
#Fitting the data into our GridSearchCV object to get our best parameters
gso.fit(x_over,y_over)

In [None]:
#Displaying the best parameters that we have obtained
gso.best_params_

In [None]:
#Build the selected model with a fixed set of hyper parameters and print its mean 5-fold
#cross-validation score.
#NOTE(review): the values are hard-coded, presumably copied from an earlier
#gso.best_params_ output -- confirm they still match after re-running the search.
chosen_params={
    'n_estimators':120,
    'max_features':'sqrt',
    'criterion':'gini',
    'class_weight':'balanced',
    'bootstrap':False,
}
eto2=ExtraTreesClassifier(**chosen_params)
fold_scores=cross_val_score(eto2,x_over,y_over,cv=5)
print(np.mean(fold_scores))