### 1.Load data files(train_u6lujuX_CVtuZ9i.csv and test_Y3wMUE5_7gLdaTN.csv)

In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
%matplotlib inline

In [2]:
datatrain=pd.read_csv('train_u6lujuX_CVtuZ9i.csv')

In [3]:
datatest=pd.read_csv('test_Y3wMUE5_7gLdaTN.csv')

In [4]:
datatrain.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
datatrain.shape

(614, 13)

### 2.Types of data columns

In [6]:
datatrain.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### DATA CLEANING AND PREPROCESSING

### 3.Find missing values

In [7]:
datatrain.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### 4.Impute missing values with mean (numerical variables)

In [8]:
datatrain['LoanAmount'].fillna(datatrain['LoanAmount'].mean(), inplace=True)

In [9]:
datatrain['Loan_Amount_Term'].fillna(datatrain['Loan_Amount_Term'].mean(), inplace=True)

In [10]:
datatrain.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### 5.Impute missing values with mode (categorical variables)

In [11]:
datatrain['Gender'].fillna(datatrain['Gender'].mode().iloc[0], inplace=True)

In [12]:
datatrain['Dependents'].fillna(datatrain['Dependents'].mode().iloc[0], inplace=True)

In [13]:
datatrain['Self_Employed'].fillna(datatrain['Self_Employed'].mode().iloc[0], inplace=True)

In [14]:
datatrain['Credit_History'].fillna(datatrain['Credit_History'].mode().iloc[0], inplace=True)

In [15]:
datatrain['Married'].fillna(datatrain['Married'].mode().iloc[0], inplace=True)

In [16]:
datatrain.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### PREDICTIVE MODELLING

### 6.Remove Loan_ID variable - Irrelevant

In [17]:
datatrain.drop('Loan_ID', axis=1, inplace=True)

In [18]:
datatrain.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### 7.Create target variable

### 8.Build dummy variables for categorical variables

In [19]:
Gender = pd.get_dummies(datatrain['Gender'],drop_first=True)

In [20]:
Education = pd.get_dummies(datatrain['Education'],drop_first=True)

In [21]:
Property_Area = pd.get_dummies(datatrain['Property_Area'],drop_first=True)

In [22]:
datatrain.drop(['Gender','Education','Property_Area', 'Married','Self_Employed', 'Dependents'],axis=1,inplace=True)


In [23]:
datatrain.replace({'Loan_Status': 'Y'}, 1, inplace=True)
datatrain.replace({'Loan_Status': 'N'}, 0, inplace=True)
datatrain = pd.concat([datatrain,Gender,Education,Property_Area,],axis=1)
datatrain.head()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Male,Not Graduate,Semiurban,Urban
0,5849,0.0,146.412162,360.0,1.0,1,1,0,0,1
1,4583,1508.0,128.0,360.0,1.0,0,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,1,1,1,0,1
4,6000,0.0,141.0,360.0,1.0,1,1,0,0,1


In [24]:
X=datatrain.drop('Loan_Status',axis=1)
X

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Male,Not Graduate,Semiurban,Urban
0,5849,0.0,146.412162,360.0,1.0,1,0,0,1
1,4583,1508.0,128.000000,360.0,1.0,1,0,0,0
2,3000,0.0,66.000000,360.0,1.0,1,0,0,1
3,2583,2358.0,120.000000,360.0,1.0,1,1,0,1
4,6000,0.0,141.000000,360.0,1.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
609,2900,0.0,71.000000,360.0,1.0,0,0,0,0
610,4106,0.0,40.000000,180.0,1.0,1,0,0,0
611,8072,240.0,253.000000,360.0,1.0,1,0,0,1
612,7583,0.0,187.000000,360.0,1.0,1,0,0,1


In [25]:
Y=datatrain['Loan_Status']
Y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int64

### 9.Split train data for cross validation

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test=train_test_split(X,Y, test_size=0.30, random_state=101)


### (a)LOGISTIC REGRESSION ALGORITHM

### 10.Fit model

In [28]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

LogisticRegression()

### 11.Predict values for cv data

In [29]:
print(logmodel.score(X_train, y_train))
print(logmodel.score(X_test, y_test))

0.8251748251748252
0.7783783783783784


In [30]:
predictions = logmodel.predict(X_test)
predictions

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### 12.Print classification report

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.90      0.41      0.56        64
           1       0.76      0.98      0.85       121

    accuracy                           0.78       185
   macro avg       0.83      0.69      0.71       185
weighted avg       0.80      0.78      0.75       185



### 13.Evaluate accuracy of model

In [32]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predictions))

[[ 26  38]
 [  3 118]]


In [33]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
roc_auc_score(y_test,predictions)

0.6907283057851239

### (b)DECISION TREE ALGORITHM

### 14.Fit model

In [34]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics 

In [35]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)


### 15.Predict values for cv data

In [36]:
y_pred = clf.predict(X_test)
y_pred

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1], dtype=int64)

### 16.Evaluate accuracy of model

In [37]:
print("Dogruluq:",metrics.accuracy_score(y_test, y_pred))

Dogruluq: 0.6594594594594595


### (c)SUPPORT VECTOR MACHINE (SVM) ALGORITHM

### 17.Fit model

In [38]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(kernel='linear')

### 18.Predict values for cv data

In [39]:
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### 19.Evaluate accuracy of model

In [40]:
from sklearn import metrics
print("Dogruluq:",metrics.accuracy_score(y_test, y_pred))

Dogruluq: 0.7297297297297297


### (d)NAIVE BAYES ALGORITHM

### 20.Fit model

In [41]:
from sklearn.naive_bayes import GaussianNB
NBmodel = GaussianNB()
NBmodel.fit(X_train, y_train)

GaussianNB()

### 21.Predict values for cv data

In [42]:
predicted = NBmodel.predict(X_test)
predicted

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### 22.Evaluate accuracy of model

In [43]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
NB = accuracy_score(y_test,predicted)
print('Accuracy Score for Naive Bayes:',NB )

Accuracy Score for Naive Bayes: 0.7783783783783784


In [1]:
#Select best model in order of accuracy
#Naive Bayes - 77 (best model)
#Logistic Regression - 69
#Decision Tree - 67
#Support Vector Machine - 72