In [82]:
#importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [83]:
#Importing Dataset and Deleting missing rows
dataset = pd.read_csv('Loan Approval.csv')
dataset = dataset.dropna()

In [84]:
#Datatype Identification
dataset.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [86]:
#Columns list
dataset.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [87]:
#Removing Special Characters from the Columns and Changing its datatype to int
dataset=dataset.replace('\+','',regex=True)
dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [88]:
#LabelEncoding Categorical Variable
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender','Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area' , 'Loan_Status']
le = LabelEncoder()
for i in var_mod:
  dataset[i] = le.fit_transform(dataset[i])

In [89]:
#Identify Highly Correlated Features
def remove_high_corr(dataset, threshold):
  del_cols = set() #To store the columns to be deleted
  cor_matrix = dataset.corr() #Getting correlation matrix
  for i in range(len(cor_matrix.columns)):
    for j in range(i):
      if(cor_matrix.iloc[i, j] > threshold) and (cor_matrix.columns[j] not in del_cols): 
        col = cor_matrix.columns[i]
        del_cols.add(col) #Extracting coluumns that has higher correlation than the threshold
        if col in dataset.columns:
          del dataset[col] #Dropping the extracted columns
  return(dataset)

In [91]:
#Removing Highly correlated features
dataset = remove_high_corr(dataset, 0.85)

In [92]:
#Feature and Label Identification
X = dataset.iloc[:,1:12].values
y = dataset.iloc[:, -1].values

In [93]:
#Datatype Conversion
X = X.astype(int)
print(X.dtype)

int64


In [94]:
#Standardising X data
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()
scl.fit(X)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [95]:
X = pd.DataFrame(scl.transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.467198,0.737162,0.218599,-0.503253,-0.399275,-0.137970,-0.027951,-0.208089,0.275542,0.413197,-1.318868
1,0.467198,0.737162,-0.762033,-0.503253,2.504541,-0.417536,-0.604632,-0.979001,0.275542,0.413197,1.259774
2,0.467198,0.737162,-0.762033,1.987072,-0.399275,-0.491180,0.297101,-0.307562,0.275542,0.413197,1.259774
3,0.467198,-1.356553,-0.762033,-0.503253,-0.399275,0.112280,-0.604632,-0.046446,0.275542,0.413197,1.259774
4,0.467198,0.737162,1.199231,-0.503253,2.504541,0.009319,0.999978,1.520245,0.275542,0.413197,1.259774
...,...,...,...,...,...,...,...,...,...,...,...
475,-2.140419,-1.356553,-0.762033,-0.503253,-0.399275,-0.435196,-0.604632,-0.916831,0.275542,0.413197,-1.318868
476,0.467198,0.737162,2.179863,-0.503253,-0.399275,-0.222210,-0.604632,-1.302286,-2.487549,0.413197,-1.318868
477,0.467198,0.737162,0.218599,-0.503253,-0.399275,0.478206,-0.512853,1.346168,0.275542,0.413197,1.259774
478,0.467198,0.737162,1.199231,-0.503253,-0.399275,0.391846,-0.604632,0.525520,0.275542,0.413197,1.259774


In [14]:
#Logistics Regression

In [96]:
#Spilitting of dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [97]:
#Fitting Logistics Regsression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
#Confusion matrix
y_pree = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pree)
print(cm)
accuracy_score(y_test, y_pree)

[[15 27]
 [ 0 78]]


0.775

In [99]:
#Comparing y test and y pred results
y_pre_yts =np.concatenate((y_pree.reshape(len(y_pree),1), y_test.reshape(len(y_test),1)),1)
y_pre_yts

array([[1, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1,

In [100]:
#Finding Y value using Logistic Regression
print(classifier.predict(scl.transform([[2 ,	5417 , 	4196 , 	267 ,	360 ,	1,1,1	,0,	1,	2]])))

[1]


In [101]:
#Evaluating Train and Test Accuracy
print(classifier.score(X_train,y_train))
print(classifier.score(X_test,y_test))

0.8166666666666667
0.775


In [102]:
#ROC Curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pree)

0.6785714285714286

In [22]:
#Decision tree classification

In [103]:
#Fitting Decision Tree Classification model
from sklearn.tree import DecisionTreeClassifier
DecisionTree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
DecisionTree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [104]:
#Confusion matrix between ytest and ypred
y_pre = DecisionTree.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pre)
print(cm)
accuracy_score(y_test, y_pre)

[[19 23]
 [18 60]]


0.6583333333333333

In [126]:
#Finding Y value using DecisionTree Classification
print(DecisionTree.predict(scl.transform([[2 ,	5417 , 	4196 , 	267 ,	360 ,	1,1,1	,0,	1,	2]])))


[0]


In [106]:
#Comparing y test and y pred esults
y_pre_yts =np.concatenate((y_pre.reshape(len(y_pre),1), y_test.reshape(len(y_test),1)),1)
y_pre_yts

array([[1, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 1],
       [1, 0],
       [0, 0],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1,

In [107]:
#Evaluating Train and Test Accuracy
print(DecisionTree.score(X_train,y_train))
print(DecisionTree.score(X_test,y_test))

1.0
0.6583333333333333


In [108]:
#ROC Curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pre)

0.6108058608058607

In [29]:
#Random_forest classification

In [110]:
#Fitting Random forest Classification model
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier()
RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [111]:
#Hypermater tuning 
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [67]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [68]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [112]:
#Fitting Random forest Classification model
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier(bootstrap = 'False', max_depth = 10,
 max_features= 'sqrt',
 min_samples_leaf= 2,
 min_samples_split= 5,
 n_estimators= 1600)
RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap='False', ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=1600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [113]:
#Confusion matrix between ytest and ypred
y_pred = RandomForest.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[16 26]
 [ 1 77]]


0.775

In [114]:
#Comparing y test and y pred results

ypp = np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)

In [115]:
#Finding Y value using RandomForest Regression
print(RandomForest.predict(scl.transform([[2 ,	5417 , 	4196 , 	267 ,	360 ,	1,1,1	,0,	1,	2]])))


[1]


In [116]:
#Evaluating Train and Test Accuracy
print(RandomForest.score(X_train,y_train))
print(RandomForest.score(X_test,y_test))

0.8805555555555555
0.775


In [117]:
#ROC Curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.6840659340659341

In [36]:
#KNN algorithm

In [118]:
from sklearn.neighbors import KNeighborsClassifier
kmodel = KNeighborsClassifier()
kmodel.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [119]:
#Hypermater tuning 
#List Hyperparameters that we want to tune.
from sklearn.model_selection import GridSearchCV
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Create new KNN object
knn_2 = KNeighborsClassifier()
#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10)
#Fit the model
best_model = clf.fit(X_train,y_train)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

Best leaf_size: 1
Best p: 1
Best n_neighbors: 12


In [120]:
from sklearn.neighbors import KNeighborsClassifier
kmodel = KNeighborsClassifier(leaf_size =1 , p=1,n_neighbors=12)
kmodel.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=1, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=12, p=1,
                     weights='uniform')

In [121]:
#Confusion matrix between ytest and ypred
y_predd = kmodel.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predd)
print(cm)
accuracy_score(y_test, y_predd)

[[14 28]
 [ 0 78]]


0.7666666666666667

In [77]:
#Comparing y test and y pred results

yppp = np.concatenate((y_predd.reshape(len(y_predd),1), y_test.reshape(len(y_test),1)),1)
yppp

array([[1, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1,

In [78]:
#Finding Y value using KNN Algorithm
print(kmodel.predict(scl.transform([[2 ,	5417 , 	4196 , 	267 ,	360 ,	1,1,1	,0,	1,	2]])))


[1]


In [79]:
#Evaluating Train and Test Accuracy
print(kmodel.score(X_train,y_train))
print(kmodel.score(X_test,y_test))

0.8055555555555556
0.7666666666666667


In [80]:
#ROC Curve
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_predd)

0.6666666666666667

In [125]:
a=[roc_auc_score(y_test, y_predd),roc_auc_score(y_test, y_pree),roc_auc_score(y_test, y_pre),roc_auc_score(y_test, y_pred)]
print("ROC Curve Value Results")
print("ROC Value of KNN",roc_auc_score(y_test, y_predd))
print("ROC Value of Logistic Regression",roc_auc_score(y_test, y_pree))
print("ROC Value of Decision Tree Classification",roc_auc_score(y_test, y_pre))
print("ROC Value of Random Forest Classification",roc_auc_score(y_test, y_pred))
print("The Highest Accuracy value ", max(a))

ROC Curve Value Results
ROC Value of KNN 0.6666666666666667
ROC Value of Logistic Regression 0.6785714285714286
ROC Value of Decision Tree Classification 0.6108058608058607
ROC Value of Random Forest Classification 0.6840659340659341
The Highest Accuracy value  0.6840659340659341
