## Importing The Packages

In [1]:
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Reading The Dataset

In [2]:
df=pd.read_csv("loan_prediction.csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(614, 13)

## Handling The Missing Values

In [5]:
#checking the null values
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
 ApplicantIncome      0
CoApplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Treating The Null Value

In [None]:
# We will fill the missing values in numeric data type using the mean value of that particular column and categorical data type using the most repeated value

In [6]:
numerical_features = df.select_dtypes(include = [np.number]).columns
categorical_features = df.select_dtypes(include = [np.object]).columns

In [7]:
numerical_features

Index([' ApplicantIncome', 'CoApplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [8]:
categorical_features

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [9]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])

In [10]:
#replace + with non value 
df['Dependents'] = df['Dependents'].str.replace('+','')

In [11]:
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mode()[0])
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [12]:
#checking the null values now
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
 ApplicantIncome     0
CoApplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
# The Null Values has been retreated now

## Handling Categorical Values

In [13]:
df.select_dtypes(include='object').columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [14]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [15]:
df['Gender'].replace({'Male':1,'Female':0},inplace=True)

In [16]:
df['Married'].unique()

array(['No', 'Yes'], dtype=object)

In [17]:
df['Married'].replace({'Yes':1,'No':0},inplace=True)

In [18]:
df['Dependents'].unique()

array(['0', '1', '2', '3'], dtype=object)

In [19]:
df['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)

In [20]:
df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [21]:
df['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)

In [22]:
df['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [23]:
df['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)

In [24]:
df['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [25]:
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

In [26]:
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [27]:
df['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)

In [28]:
df['CoApplicantIncome']=df['CoApplicantIncome'].astype("int64")
df['LoanAmount']=df['LoanAmount'].astype("int64")
df['Loan_Amount_Term']=df['Loan_Amount_Term'].astype("int64")
df['Credit_History']=df['Credit_History'].astype("int64")

In [29]:
# dummy columns are created for the categories in Loan_ID
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Loan_ID'] = le.fit_transform(df.Loan_ID)

In [30]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


In [31]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,609,0,0,0,1,0,2900,0,71,360,1,0,1
610,610,1,1,3,1,0,4106,0,40,180,1,0,1
611,611,1,1,1,1,0,8072,240,253,360,1,2,1
612,612,1,1,2,1,0,7583,0,187,360,1,2,1
613,613,0,0,0,1,1,4583,0,133,360,0,1,0


# Balancing The Dataset

In [32]:
from imblearn.combine import SMOTETomek

In [33]:
smote = SMOTETomek(0.90)

In [34]:
#dividing the dataset into dependent and independent y and x respectively

y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'],axis=1)

In [35]:
#creating the new x and y for balance data
x_bal,y_bal = smote.fit_resample(x,y)

In [36]:
#printing the value before and after balancing 
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    360
0    317
Name: Loan_Status, dtype: int64


## Scaling The Data

In [37]:
from sklearn.preprocessing import StandardScaler

In [38]:
sc = StandardScaler()
x_bal = sc.fit_transform(x_bal)

In [39]:
x_bal = pd.DataFrame(x_bal)

In [41]:
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.764089,0.533507,-1.194372,-0.710283,0.604086,-0.314684,0.08124,-0.5053,-0.292786,0.273236,0.579056,1.341256
1,-1.75828,0.533507,0.83726,0.339633,0.604086,-0.314684,-0.131588,0.001887,-0.189783,0.273236,0.579056,-1.223801
2,-1.752472,0.533507,0.83726,-0.710283,0.604086,3.177792,-0.397707,-0.5053,-0.988057,0.273236,0.579056,1.341256
3,-1.746664,0.533507,0.83726,-0.710283,-1.655395,-0.314684,-0.467809,0.287769,-0.292786,0.273236,0.579056,1.341256
4,-1.740856,0.533507,-1.194372,-0.710283,0.604086,-0.314684,0.106625,-0.5053,-0.022404,0.273236,0.579056,1.341256


# Splitting Data Into Train And Test

In [42]:
# splitting the data into training and testing set

from sklearn.model_selection import train_test_split

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 42)

In [44]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(453, 12)
(453,)
(224, 12)
(224,)


In [45]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
291,0.216527,-1.874389,-1.194372,-0.710283,0.604086,-0.314684,-0.502272,-0.505300,-1.245564,0.273236,0.579056,-1.223801
221,-0.259750,0.533507,0.837260,2.439466,-1.655395,-0.314684,-0.073085,-0.505300,-0.189783,0.273236,0.579056,0.058727
616,0.483707,0.533507,-1.194372,-0.710283,0.604086,3.177792,0.819415,-0.505300,0.389608,0.273236,-1.726949,0.058727
662,-1.409785,0.533507,-1.194372,1.389550,-1.655395,-0.314684,-0.105867,-0.505300,-0.576045,0.273236,-1.726949,-1.223801
319,0.431433,0.533507,-1.194372,-0.710283,0.604086,-0.314684,-0.386107,-0.505300,-0.923680,2.090481,0.579056,1.341256
...,...,...,...,...,...,...,...,...,...,...,...,...
71,-1.282003,0.533507,0.837260,-0.710283,0.604086,-0.314684,0.106625,0.251445,1.574143,0.273236,0.579056,0.058727
106,-1.049673,0.533507,0.837260,1.389550,0.604086,-0.314684,-0.404936,-0.505300,-0.794926,0.273236,0.579056,0.058727
270,0.071321,0.533507,0.837260,1.389550,0.604086,-0.314684,-0.277172,-0.505300,-0.292786,0.273236,0.579056,0.058727
435,1.291055,0.533507,0.837260,-0.710283,0.604086,-0.314684,-0.267421,-0.505300,-0.421540,0.273236,0.579056,0.058727


In [46]:
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
646,-0.050653,-1.874389,0.837260,0.339633,0.604086,-0.314684,-0.133437,-0.505300,-0.498792,0.273236,0.579056,0.058727
336,0.565022,0.533507,-1.194372,-0.710283,-1.655395,-0.314684,-0.438894,-0.505300,-1.000932,-0.635387,0.579056,-1.223801
63,-1.340086,0.533507,0.837260,2.439466,-1.655395,-0.314684,-0.102673,-0.505300,-0.614671,0.273236,-1.726949,0.058727
367,0.785736,-1.874389,0.837260,-0.710283,-1.655395,3.177792,0.298608,-0.505300,-0.061030,0.273236,0.579056,-1.223801
101,-1.078714,0.533507,0.837260,-0.710283,0.604086,-0.314684,0.034001,0.215121,0.415359,0.273236,0.579056,-1.223801
...,...,...,...,...,...,...,...,...,...,...,...,...
33,-1.537567,0.533507,-1.194372,-0.710283,-1.655395,-0.314684,-0.271960,0.055700,-0.421540,0.273236,0.579056,0.058727
227,-0.219092,0.533507,0.837260,1.389550,0.604086,-0.314684,-0.110574,-0.038809,0.093475,0.273236,0.579056,0.058727
480,1.610509,0.533507,-1.194372,-0.710283,0.604086,-0.314684,-0.593388,10.875129,-0.679048,0.273236,0.579056,1.341256
9,-1.711814,0.533507,0.837260,0.339633,0.604086,-0.314684,1.256671,3.183578,2.655674,0.273236,0.579056,0.058727


In [47]:
y_train

291    0
221    0
616    0
662    0
319    0
      ..
71     0
106    1
270    1
435    1
102    1
Name: Loan_Status, Length: 453, dtype: int64