In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from scipy import stats

In [2]:
# Read the raw loan data from the project's Dataset folder and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer="../Dataset/loan_data_set.csv")
dataset.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Data Preprocessing

In [3]:
# (rows, columns) of the freshly loaded frame
dataset.shape

(614, 13)

In [4]:
# Per-column count of missing (NaN) entries
dataset.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Taking care of missing data

In [5]:
# Drop only the Loan_ID identifier column.
# Credit_History and Loan_Amount_Term are kept and imputed below.
dataset = dataset.drop(columns=['Loan_ID'])

# Drop rows where Gender, Married or Self_Employed is missing
dataset = dataset.dropna(subset=['Gender', 'Married', 'Self_Employed'])

# Impute the remaining gaps: mode for the discrete columns, mean for LoanAmount.
# The statistics are computed after the row drop above, and the result is assigned
# back to `dataset`. Calling `dataset[col].fillna(..., inplace=True)` is a chained
# in-place write on a temporary Series: it is deprecated in recent pandas and has
# no effect on `dataset` once Copy-on-Write is enabled.
fill_values = {
    'Dependents': dataset['Dependents'].mode()[0],
    'Loan_Amount_Term': dataset['Loan_Amount_Term'].mode()[0],
    'Credit_History': dataset['Credit_History'].mode()[0],
    'LoanAmount': dataset['LoanAmount'].mean(),
}
dataset = dataset.fillna(value=fill_values)

In [6]:
# Re-count missing entries per column after the imputation step
dataset.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Encoding categorical data

In [7]:
# Integer-encode each categorical column with one shared LabelEncoder.
# The encoder is re-fitted per column, so after the loop it holds the
# classes of the last column processed (Loan_Status).
labelencoder = LabelEncoder()
categorical_columns = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
for column_name in categorical_columns:
    dataset[column_name] = labelencoder.fit_transform(dataset[column_name].values)

In [8]:
# One-hot encode the integer Property_Area codes into dense indicator columns.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and 1.4 removed
# `sparse`, so try the current keyword first and fall back on older installations.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehotencoder = OneHotEncoder(sparse=False)

# NOTE(review): the column names assume the encoded categories come out in the order
# Rural, Semiurban, Urban (codes 0, 1, 2 from the label encoding) - confirm.
dataset[['Rural', 'Semiurban', 'Urban']] = onehotencoder.fit_transform(
    dataset.Property_Area.values.reshape(-1, 1)
)

# Drop the Property_Area column now that it is represented by the indicator columns
dataset = dataset.drop(['Property_Area'], axis=1)

In [9]:
# Frequency of each Dependents value (NaN would be counted too, if present)
dataset['Dependents'].value_counts(dropna=False)

0     331
1      95
2      94
3+     46
Name: Dependents, dtype: int64

In [10]:
# Map the open-ended '3+' bucket to 3 so the whole column can be cast to integers
dependents_numeric = dataset['Dependents'].replace({'3+': 3})
dataset['Dependents'] = dependents_numeric.astype(int)

In [11]:
# Remove outliers: keep only rows whose z-score is below 3 in absolute value
# for every column (the usual three-standard-deviations rule).
# NOTE(review): the z-score is computed over all columns, including the encoded
# categorical columns and the Loan_Status target - confirm this is intended.
within_three_sigma = (np.abs(stats.zscore(dataset)) < 3).all(axis=1)

# .copy() makes the filtered frame independent of the original, so the later
# column assignments do not raise SettingWithCopyWarning.
dataset = dataset[within_three_sigma].copy()

In [12]:
# Summary of row count, column dtypes and non-null counts after outlier removal
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 0 to 613
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             534 non-null    int32  
 1   Married            534 non-null    int32  
 2   Dependents         534 non-null    int32  
 3   Education          534 non-null    int32  
 4   Self_Employed      534 non-null    int32  
 5   ApplicantIncome    534 non-null    int64  
 6   CoapplicantIncome  534 non-null    float64
 7   LoanAmount         534 non-null    float64
 8   Loan_Amount_Term   534 non-null    float64
 9   Credit_History     534 non-null    float64
 10  Loan_Status        534 non-null    int32  
 11  Rural              534 non-null    float64
 12  Semiurban          534 non-null    float64
 13  Urban              534 non-null    float64
dtypes: float64(7), int32(6), int64(1)
memory usage: 50.1 KB


### Normalizing the data

In [13]:
# Rescale the continuous columns with MinMaxScaler (default feature range 0 to 1).
# NOTE(review): the scaler is fitted on the whole frame; if a train/test split
# happens downstream, confirm that fitting before the split is acceptable.
mmScaler = MinMaxScaler()
continuous_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
dataset[continuous_columns] = mmScaler.fit_transform(dataset[continuous_columns])

### Saving the preprocessed data to CSV

In [14]:
# Write the preprocessed frame next to the raw data, without the index column
dataset.to_csv(path_or_buf='../Dataset/preprocess_loan_data_set.csv', index=False)
