In [2]:
# ------------------------------------------------------------------
# Build the Support Vector Classifier Model
# Predict the loan approval status based on 
# Gender, Marital Status, Credit History, Income and Loan Amount
# ------------------------------------------------------------------

# Import Libraries and read csv file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import statistics as st

In [4]:
#find out columns with missing values
df=pd.read_csv("01Exercise1.csv")
df

Unnamed: 0,gender,married,ch,income,loanamt,status
0,Male,No,1.0,5849,,Y
1,Male,Yes,1.0,4583,128.0,N
2,Male,Yes,1.0,3000,66.0,Y
3,Male,Yes,1.0,2583,120.0,Y
4,Male,No,1.0,6000,141.0,Y
...,...,...,...,...,...,...
609,Female,No,1.0,2900,71.0,Y
610,Male,Yes,1.0,4106,40.0,Y
611,Male,Yes,1.0,8072,253.0,Y
612,Male,Yes,1.0,7583,187.0,Y


In [5]:
df.isnull().sum()

gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

In [6]:
# Replace Missing Values. Drop the rows.
df.gender.fillna(df.gender.mode()[0],inplace=True)
df.married.fillna(df.married.mode()[0],inplace=True)
df.ch.fillna(df.ch.mode()[0],inplace=True)
df.loanamt.fillna(df.loanamt.mode()[0],inplace=True)

In [7]:
# Drop irrelevant columns based on business sense
df=df.drop('status',axis=1)

In [8]:
# Create Dummy variables


In [9]:
df=pd.get_dummies(df)

In [10]:
# Normalize the data (Income and Loan Amount) Using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler_=StandardScaler()
scaler_

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
df['income'] = scaler_.fit_transform(df[['income']])
df['loanamt'] = scaler_.fit_transform(df[['loanamt']])
df

Unnamed: 0,ch,income,loanamt,gender_Female,gender_Male,married_No,married_Yes
0,1.0,0.072991,-0.302759,0,1,1,0
1,1.0,-0.134412,-0.207648,0,1,0,1
2,1.0,-0.393747,-0.944757,0,1,0,1
3,1.0,-0.462062,-0.302759,0,1,0,1
4,1.0,0.097728,-0.053093,0,1,1,0
...,...,...,...,...,...,...,...
609,1.0,-0.410130,-0.885313,1,0,1,0
610,1.0,-0.212557,-1.253868,0,1,0,1
611,1.0,0.437174,1.278459,0,1,0,1
612,1.0,0.357064,0.493794,0,1,0,1


In [12]:
# Create the X (Independent) and Y (Dependent) dataframes
X = df.iloc[:, :-1]
X

Unnamed: 0,ch,income,loanamt,gender_Female,gender_Male,married_No
0,1.0,0.072991,-0.302759,0,1,1
1,1.0,-0.134412,-0.207648,0,1,0
2,1.0,-0.393747,-0.944757,0,1,0
3,1.0,-0.462062,-0.302759,0,1,0
4,1.0,0.097728,-0.053093,0,1,1
...,...,...,...,...,...,...
609,1.0,-0.410130,-0.885313,1,0,1
610,1.0,-0.212557,-1.253868,0,1,0
611,1.0,0.437174,1.278459,0,1,0
612,1.0,0.357064,0.493794,0,1,0


In [13]:
Y=df.iloc[:, -1:]
Y

Unnamed: 0,married_Yes
0,0
1,1
2,1
3,1
4,0
...,...
609,0
610,1
611,1
612,1


In [14]:
# Split the X and Y dataset into training and testing set
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)

In [21]:
# Import and build Support Vector Classifier
from sklearn import svm
svm_model=svm.SVC()
svm_model.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
# Predict the outcome using Test data
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
prediction=logmodel.predict(x_test)

In [16]:
# Build the conufsion matrix and get the accuracy/score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [29]:
ac=accuracy_score(y_test,prediction)
ac

1.0

In [30]:
matrix=confusion_matrix(y_test,prediction)

In [31]:
matrix

array([[ 63,   0],
       [  0, 122]], dtype=int64)