In [28]:
# This file contains a comparison between KNN and Decision Tree Classifier Algorithms, applied on Loan Approval Dataset
# to predict if a loan should be approved. It also highlights which algorithm gives a better prediction based on accuracy scores.

# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# To read CSV data
data = pd.read_csv("C:/Users/Aditi Arora/Documents/FORE/Trimester 3/MLP/archive/train_u6lujuX_CVtuZ9i (1).csv")

In [29]:
# Male = 0, Female = 1
data['Gender'] = data['Gender'].replace({'Male':0, 'Female':1,'Unknown':2})
# Yes = 1, No = 0
data['Married'] = data['Married'].replace({'Yes':1, 'No':0, 'Unknown':2})
# Graduate = 1, Not Graduate = 0
data['Education'] = data['Education'].replace({'Graduate':1, 'Not Graduate':0})
# Yes = 1
# No = 0
# Unknown = 2
data['Self_Employed'] = data['Self_Employed'].replace ({'Yes':1, 'No':0, 'Unknown':2})
data['Property_Area'] = data['Property_Area'].replace ({'Semiurban':1, 'Urban': 0, 'Rural':2})
data['Loan_Status'] = data['Loan_Status'].replace({'Y':1, 'N':0})
data['Dependents'] = data['Dependents'].replace({'3+':3})

In [30]:
# Preview data
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.0,0.0,0,1,0.0,5849,0.0,,360.0,1.0,0,1
1,LP001003,0.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,2,0
2,LP001005,0.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,0,1
3,LP001006,0.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,0,1
4,LP001008,0.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,0,1


In [31]:
# Adjustments to missing values
train_data = data.copy()
train_data['Gender'].fillna(train_data['Gender'].value_counts().idxmax(), inplace=True)
train_data['Married'].fillna(train_data['Married'].value_counts().idxmax(), inplace=True)
train_data['Dependents'].fillna(train_data['Dependents'].value_counts().idxmax(), inplace=True)
train_data['Self_Employed'].fillna(train_data['Self_Employed'].value_counts().idxmax(), inplace=True)
train_data["LoanAmount"].fillna(train_data["LoanAmount"].mean(skipna=True), inplace=True)
train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].value_counts().idxmax(), inplace=True)
train_data['Credit_History'].fillna(train_data['Credit_History'].value_counts().idxmax(), inplace=True)

In [32]:
# Separate feature and target
X = train_data.iloc[:,1:-1]
y = train_data.iloc[:, -1]

In [33]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,0.0,0,1,0.0,5849,0.0,146.412162,360.0,1.0,0
1,0.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,2
2,0.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,0
3,0.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,0
4,0.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,0


In [34]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [50]:
y_pred = knn.predict(X_test)

In [51]:
# Model Accuracy
knnAcc = knn.score(X_test, y_test)
print("Accuracy is %.2f%%" %(knnAcc*100))

Accuracy is 63.24%


In [54]:
Gen = input("Gender? \nInput 1 for Male and 0 for Female: ")
Marr = input("If marrried? \nInput 1 for Yes and 0 for No: ")
Depen = input("Dependents present? \nInput 1 or 2 or 3+ (in case of more than 3): ")
Edu = input ("Education level? \nInput 0 for Not Graduate and 1 for Graduate: ")
SelfEmp = input("Self employed? \nInput 1 for Yes and 0 for No: ")
AppInc = input("Enter Applicant Income: ")
CoApInc = input("Enter Co-Applicant Income: ")
LoAmt = input("Enter Loan Amount: ")
LoAmtTerm = input("Enter Loan Amount Term: ")
Crehis = input("Enter Credit History: ")
PropAre = input("Enter Property Area: 1 for Urban and 0 for Rural: ")
X_actual_values = [Gen, Marr, Depen, Edu, SelfEmp, AppInc, CoApInc, LoAmt, LoAmtTerm, Crehis, PropAre]
X_actual_values

Gender? 
Input 1 for Male and 0 for Female: 0
If marrried? 
Input 1 for Yes and 0 for No: 0
Dependents present? 
Input 1 or 2 or 3+ (in case of more than 3): 2
Education level? 
Input 0 for Not Graduate and 1 for Graduate: 1
Self employed? 
Input 1 for Yes and 0 for No: 0
Enter Applicant Income: 40000
Enter Co-Applicant Income: 1500
Enter Loan Amount: 500000
Enter Loan Amount Term: 4
Enter Credit History: 1
Enter Property Area: 1 for Urban and 0 for Rural: 1


['0', '0', '2', '1', '0', '40000', '1500', '500000', '4', '1', '1']

In [55]:
X_actual_values = np.array(X_actual_values).astype('int16')
X_actual_values = X_actual_values.reshape(1,11)
X_actual_values = pd.DataFrame(X_actual_values)
X_actual_values.columns = (['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
'Loan_Amount_Term', 'Credit_History', 'Property_Area'])
y_actual_pred = knn.predict(X_actual_values)
print('Should the person be given a loan? \n1 for Yes and 0 for No. \nAs per KNN the answer is: ', y_actual_pred)

Should the person be given a loan? 
1 for Yes and 0 for No. 
As per KNN the answer is:  [1]


In [56]:
# Separate feature and target
X = train_data.iloc[:,1:-1]
y = train_data.iloc[:, -1]

In [57]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1) # 70% training and 30% test

In [58]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth=4)

# Train Decision Tree Classifer
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

In [59]:
# Model Accuracy
dtAcc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy is %.2f%%" %(dtAcc*100))

Accuracy is 78.38%


In [60]:
print(confusion_matrix(y_test, y_pred))

[[ 25  36]
 [  4 120]]


In [61]:
f1_score(y_test, y_pred)

0.8571428571428572

In [62]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.41      0.56        61
           1       0.77      0.97      0.86       124

    accuracy                           0.78       185
   macro avg       0.82      0.69      0.71       185
weighted avg       0.80      0.78      0.76       185



In [64]:
# Comparison between KNN & Decision Tree Classifier
if(dtAcc > knnAcc):
    {print('Decision Tree Classifier is better than KNN Classifier with an accuracy score of %.2f%%' %(dtAcc*100))}
else:
    {print('KNN Classifier is better than Decision Tree Classifier with an accuracy score of %.2f%%' %(knnAcc*100))}

Decision Tree Classifier is better than KNN Classifier with an accuracy score of 78.38%
