In [46]:
# This file contains a comparison between KNN, Decision Tree and Random Forest Classifier Algorithms, applied on German Credit Dataset
# to predict if a loan is good or bad. It also highlights which algorithm gives a better prediction based on accuracy scores.

# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# To read CSV data
data = pd.read_csv("C:/Users/Aditi Arora/Documents/FORE/Trimester 3/MLP/archive/german_credit_data.csv")

In [47]:
# male = 0, female = 1
data['Sex'] = data['Sex'].replace({'male':0, 'female':1})
# own = 0, free = 1, rent = 2
data['Housing'] = data['Housing'].replace({'own':0, 'free':1, 'rent':2})
# NA = 0, little = 1, moderate = 2, quite rich = 3, rich = 4
data['Saving accounts'] = data['Saving accounts'].replace({'NA':0, 'little':1, 'moderate':2, 'quite rich':3, 'rich':4})
# NA = 0, little = 1, moderate = 2, rich = 3
data['Checking account'] = data['Checking account'].replace ({'NA':0, 'little':1, 'moderate':2, 'rich':3})
# Purpose = 0 to 7
data['Purpose'] = data['Purpose'].replace ({'business':0, 'car':1, 'domestic appliances':2, 'education':3, 'furniture/equipment':4, 'radio/TV':5, 'repairs':6, 'vacation/others':7})
# bad = 0, good = 1
data['Risk'] = data['Risk'].replace({'bad':0, 'good':1})

In [48]:
# Preview data
data.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,0,2,0,,1.0,1169,6,5,1
1,1,22,1,2,0,1.0,2.0,5951,48,5,0
2,2,49,0,1,0,1.0,,2096,12,3,1
3,3,45,0,2,1,1.0,1.0,7882,42,4,1
4,4,53,0,2,1,1.0,1.0,4870,24,1,0
5,5,35,0,1,1,,,9055,36,3,1
6,6,53,0,2,0,3.0,,2835,24,4,1
7,7,35,0,3,2,1.0,2.0,6948,36,1,1
8,8,61,0,1,0,4.0,,3059,12,5,1
9,9,28,0,3,0,1.0,2.0,5234,30,1,0


In [49]:
# Adjustments to missing values
train_data = data.copy()
train_data['Saving accounts'].fillna(train_data['Saving accounts'].value_counts().idxmax(), inplace=True)
train_data['Checking account'].fillna(train_data['Checking account'].value_counts().idxmax(), inplace=True)

In [50]:
# Preview data
train_data.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,0,2,0,1.0,1.0,1169,6,5,1
1,1,22,1,2,0,1.0,2.0,5951,48,5,0
2,2,49,0,1,0,1.0,1.0,2096,12,3,1
3,3,45,0,2,1,1.0,1.0,7882,42,4,1
4,4,53,0,2,1,1.0,1.0,4870,24,1,0
5,5,35,0,1,1,1.0,1.0,9055,36,3,1
6,6,53,0,2,0,3.0,1.0,2835,24,4,1
7,7,35,0,3,2,1.0,2.0,6948,36,1,1
8,8,61,0,1,0,4.0,1.0,3059,12,5,1
9,9,28,0,3,0,1.0,2.0,5234,30,1,0


In [51]:
# KNN Classifier
# Separate feature and target
X = train_data.iloc[:,1:-1]
y = train_data.iloc[:, -1]

In [52]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)
X.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,67,0,2,0,1.0,1.0,1169,6,5
1,22,1,2,0,1.0,2.0,5951,48,5
2,49,0,1,0,1.0,1.0,2096,12,3
3,45,0,2,1,1.0,1.0,7882,42,4
4,53,0,2,1,1.0,1.0,4870,24,1


In [53]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [54]:
y_pred = knn.predict(X_test)

In [55]:
# Model Accuracy
knnAcc = knn.score(X_test, y_test)
print("Accuracy is %.2f%%" %(knnAcc*100))

Accuracy is 66.00%


In [56]:
Age = input("Enter Applicant's Age: ")
Sex = input("Gender? \nInput 1 for Male and 0 for Female: ")
Job = input("Jobs Done? \nInput 0, 1, 2 or 3: ")
Hou = input ("Housing type? \nInput 0 for Own, 1 for Free and 2 for Rent: ")
SaveAcc = input ("Savings Account type? \nInput 1 for Little, 2 for Moderate, 3 for Quite Rich and 4 for Rich: ")
CheAcc = input ("Checking Account type? \nInput 1 for Little, 2 for Moderate, 3 for Rich: ")
Cred = input("Enter Credit Amount: ")
Dur = input("Enter Duration: ")
Pur = input("Purpose for loan? \nInput 0 for Business, 1 for Car, 2 for Domestic Appliances,"
            " 3 for Education, 4 for Furniture/Equipment, 5 for Radio/TV, 6 for Repairs, 7 for Vacation/Others: ")
X_actual_values = [Age, Sex, Job, Hou, SaveAcc, CheAcc, Cred, Dur, Pur]
X_actual_values

Enter Applicant's Age: 30
Gender? 
Input 1 for Male and 0 for Female: 1
Jobs Done? 
Input 0, 1, 2 or 3: 2
Housing type? 
Input 0 for Own, 1 for Free and 2 for Rent: 0
Savings Account type? 
Input 1 for Little, 2 for Moderate, 3 for Quite Rich and 4 for Rich: 2
Checking Account type? 
Input 1 for Little, 2 for Moderate, 3 for Rich: 3
Enter Credit Amount: 88000
Enter Duration: 4
Purpose for loan? 
Input 0 for Business, 1 for Car, 2 for Domestic Appliances, 3 for Education, 4 for Furniture/Equipment, 5 for Radio/TV, 6 for Repairs, 7 for Vacation/Others: 3


['30', '1', '2', '0', '2', '3', '88000', '4', '3']

In [57]:
X_actual_values = np.array(X_actual_values).astype('int16')
X_actual_values = X_actual_values.reshape(1,9)
X_actual_values = pd.DataFrame(X_actual_values)
X_actual_values.columns = (['Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
'Checking account', 'Credit amount', 'Duration', 'Purpose'])
y_actual_pred = knn.predict(X_actual_values)
print('Should the person be given a loan? \n1 for Yes and 0 for No. \nAs per KNN, the answer is: ', y_actual_pred)

Should the person be given a loan? 
1 for Yes and 0 for No. 
As per KNN, the answer is:  [0]


In [58]:
# Decision Tree Classifier
# Separate feature and target
X = train_data.iloc[:,1:-1]
y = train_data.iloc[:, -1]

In [59]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1) # 70% training and 30% test

In [60]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth=4)

# Train Decision Tree Classifer
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("y_pred: \n", y_pred)
print("y_test: \n", y_test)

y_pred: 
 [1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1]
y_test: 
 507    0
818    1
452    1
368    0
242    0
      ..
459    1
415    1
61     1
347    1
349    0
Name: Risk, Length: 300, dtype: int64


In [61]:
# Model Accuracy
dtAcc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy is %.2f%%" %(dtAcc*100))

Accuracy is 70.00%


In [62]:
X_actual_values = np.array(X_actual_values).astype('int16')
X_actual_values = X_actual_values.reshape(1,9)
X_actual_values = pd.DataFrame(X_actual_values)
X_actual_values.columns = (['Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
'Checking account', 'Credit amount', 'Duration', 'Purpose'])
y_actual_pred = clf.predict(X_actual_values)
print('Should the person be given a loan? \n1 for Yes and 0 for No. \nAs per Decision Tree, the answer is: ', y_actual_pred)

Should the person be given a loan? 
1 for Yes and 0 for No. 
As per Decision Tree, the answer is:  [1]


In [63]:
print(confusion_matrix(y_test, y_pred))

[[ 13  73]
 [ 17 197]]


In [64]:
f1_score(y_test, y_pred)

0.8140495867768595

In [65]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.15      0.22        86
           1       0.73      0.92      0.81       214

    accuracy                           0.70       300
   macro avg       0.58      0.54      0.52       300
weighted avg       0.64      0.70      0.64       300



In [71]:
# Random Forest Classifier
# split into inputs and outputs
X = train_data.iloc[:,1:-1]
y = train_data.iloc[:, -1]
print(X.shape, y.shape)
# split into train test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1000, 9) (1000,)
(670, 9) (330, 9) (670,) (330,)


In [72]:
# fit the model
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)
# make predictions
y_pred = model.predict(X_test)
print(y_pred.shape)
print(y_test.shape)
print("y_pred: \n", y_pred)
print("y_test: \n", y_test)

(330,)
(330,)
y_pred: 
 [1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 1 1
 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
y_test: 
 507    0
818    1
452    1
368    0
242    0
      ..
817    1
495    0
66     1
403    1
563    0
Name: Risk, Length: 330, dtype: int64


In [73]:
# evaluate predictions
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %.3f' % mae)
mse = mean_squared_error(y_test, y_pred)
print('MSE: %.3f' % mse)

MAE: 0.312
MSE: 0.312


In [74]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
rfAcc = accuracy_score(y_test, y_pred)
print("Accuracy is %.2f%%" %(rfAcc*100))

[[ 22  70]
 [ 33 205]]
              precision    recall  f1-score   support

           0       0.40      0.24      0.30        92
           1       0.75      0.86      0.80       238

    accuracy                           0.69       330
   macro avg       0.57      0.55      0.55       330
weighted avg       0.65      0.69      0.66       330

Accuracy is 68.79%


In [75]:
X_actual_values = np.array(X_actual_values).astype('int16')
X_actual_values = X_actual_values.reshape(1,9)
X_actual_values = pd.DataFrame(X_actual_values)
X_actual_values.columns = (['Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
'Checking account', 'Credit amount', 'Duration', 'Purpose'])
y_actual_pred = model.predict(X_actual_values)
print('Should the person be given a loan? \n1 for Yes and 0 for No. \nAs per Random Forest, the answer is: ', y_actual_pred)

Should the person be given a loan? 
1 for Yes and 0 for No. 
As per Random Forest, the answer is:  [0]


In [76]:
# Comparison between KNN, Decision Tree & Random Forest Classifier
if(dtAcc >= knnAcc):
    if(dtAcc >= rfAcc):
        {print('Decision Tree Classifier is the best with an accuracy score of %.2f%%' %(dtAcc*100))}
        
    else:
        {print('Random Forest Classifier is the best with an accuracy score of %.2f%%' %(rfAcc*100))}
        
else:
    if(knnAcc >= rfAcc):
        {print('KNN Classifier is the best with an accuracy score of %.2f%%' %(knnAcc*100))}
        
    else:
        {print('Random Forest Classifier is the best with an accuracy score of %.2f%%' %(rfAcc*100))}

Decision Tree Classifier is the best with an accuracy score of 70.00%
