#Getting the system ready and loading the data

In [None]:
#!pip install skimpy



In [None]:
!pip install category_encoders



In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import skimpy as sk
from category_encoders import OneHotEncoder
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
loan_df = pd.read_csv("./data-for-project-1/raw_data.csv")
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#Exploratory Data Analysis

In [None]:
loan_df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [4]:
labels = (
    loan_df['Loan_Status']
    .astype('str')
    .str.replace('0','No', regex=True)
    .str.replace('1','Yes', regex=True)
    .value_counts()
)

fig = px.bar(
    data_frame=labels,
    x=labels.index,
    y=labels.values,
    title=f'Class Imbalance',
    color=labels.index
)

fig.show()

#Missing value and outlier treatment

In [None]:
loan_df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [None]:
#dropping the missing values
loan_df = loan_df.dropna()

In [None]:
loan_df.isnull().sum()
#output is zero columns with missing values

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
sk.skim(loan_df)

In [None]:
loan_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [None]:
loan_encode = OneHotEncoder(use_cat_names = True, cols= ["Education", "Married", "Gender", "Self_Employed", "Property_Area", "Loan_Status", "Dependents"])

encoded = loan_encode.fit_transform(loan_df)
encoded.head()
#use replace

Unnamed: 0,Loan_ID,Gender_Male,Gender_Female,Married_Yes,Married_No,Dependents_1,Dependents_0,Dependents_2,Dependents_3+,Education_Graduate,...,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area_Rural,Property_Area_Urban,Property_Area_Semiurban,Loan_Status_N,Loan_Status_Y
1,LP001003,1,0,1,0,1,0,0,0,1,...,4583,1508.0,128.0,360.0,1.0,1,0,0,1,0
2,LP001005,1,0,1,0,0,1,0,0,1,...,3000,0.0,66.0,360.0,1.0,0,1,0,0,1
3,LP001006,1,0,1,0,0,1,0,0,0,...,2583,2358.0,120.0,360.0,1.0,0,1,0,0,1
4,LP001008,1,0,0,1,0,1,0,0,1,...,6000,0.0,141.0,360.0,1.0,0,1,0,0,1
5,LP001011,1,0,1,0,0,0,1,0,1,...,5417,4196.0,267.0,360.0,1.0,0,1,0,0,1


In [None]:
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [None]:
loan_df.replace({"Married": {"Yes":1, "No":0}, "Gender":{"Male":1, "Female": 0}, "Dependents":{"3+":3}, "Education":{"Graduate":1,"Not Graduate":0}, "Property_Area": {"Urban":2, "Semiurban":1, "Rural":0}, "Loan_Status": {"Y":1, "N":0}, "Self_Employed": {"Yes":1, "No":0}}, inplace=True)

In [None]:
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,2,1
5,LP001011,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,2,1


In [None]:
X = loan_df.drop(columns = ["Loan_ID", "Loan_Status"], axis =1)
Y = loan_df["Loan_Status"]

In [None]:
print(X)
print(Y)

     Gender  Married Dependents  Education  Self_Employed  ApplicantIncome  \
1         1        1          1          1              0             4583   
2         1        1          0          1              1             3000   
3         1        1          0          0              0             2583   
4         1        0          0          1              0             6000   
5         1        1          2          1              1             5417   
..      ...      ...        ...        ...            ...              ...   
609       0        0          0          1              0             2900   
610       1        1          3          1              0             4106   
611       1        1          1          1              0             8072   
612       1        1          2          1              0             7583   
613       0        0          0          1              1             4583   

     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_Hi

Train test split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(480, 11) (384, 11) (96, 11)


In [None]:
classifier = svm.SVC(kernel="linear")

In [None]:
classifier.fit(X_train, Y_train)

In [None]:
loan_predic = classifier.predict(X_train)
Data_acu = accuracy_score(loan_predic, Y_train)

In [None]:
print("Accuracy: ",Data_acu)

Accuracy:  0.7864583333333334
