# Data Cleaning

## Setting up Environment

In [1]:
# External Python Packages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# One shared encoder for the whole notebook. Each fit_transform call refits it,
# so it only retains the classes of the most recently encoded column.
label_encoder = LabelEncoder()

# Standard Library Packages
import os

In [2]:
# Load the raw loan data; the first CSV column becomes the DataFrame index.
# (Cleaning happens in the cells below; the cleaned copy is saved at the end of the notebook.)
raw_csv_path = 'loan.csv'
loan_df = pd.read_csv(raw_csv_path, index_col=0)

## Preliminary Data Understanding

In [3]:
# Summarise the columns: names, non-null counts and dtypes.
# Non-null counts below the row total reveal which columns have missing values.
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


## Data Cleaning

Remove null values from the dataset by dropping every row that has at least one missing field (614 rows → 480 rows).

In [4]:
# Drop every row that has at least one missing value.
# Rebinding the name (instead of inplace=True) keeps the step explicit and chainable.
loan_df = loan_df.dropna()

In [5]:
# NOTE(review): disabled alternative — one-hot encode all categorical columns in one step
# with pd.get_dummies instead of the per-column encoding done in the cells below.
# Remove this cell if the alternative is no longer needed.
# loan_df = pd.get_dummies(loan_df)

#### Cleaning Data Columns

Cleaning Gender Column

In [6]:
# Label frequencies in Gender before encoding
loan_df['Gender'].value_counts()

Male      394
Female     86
Name: Gender, dtype: int64

In [7]:
# Binary-encode Gender for machine learning.
# LabelEncoder assigns codes in sorted label order:
#   Female -> 0
#   Male   -> 1
gender_codes = label_encoder.fit_transform(loan_df['Gender'])
loan_df['Gender'] = gender_codes

In [8]:
# Code frequencies in Gender after encoding
loan_df['Gender'].value_counts()

1    394
0     86
Name: Gender, dtype: int64

Cleaning Married Column

In [9]:
# Label frequencies in Married before encoding
loan_df['Married'].value_counts()

Yes    311
No     169
Name: Married, dtype: int64

In [10]:
# Binary-encode Married for machine learning.
# LabelEncoder assigns codes in sorted label order:
#   No  -> 0
#   Yes -> 1
married_codes = label_encoder.fit_transform(loan_df['Married'])
loan_df['Married'] = married_codes

In [11]:
# Code frequencies in Married after encoding
loan_df['Married'].value_counts()

1    311
0    169
Name: Married, dtype: int64

Cleaning Dependents Column

In [12]:
# Label frequencies in Dependents before cleaning
loan_df['Dependents'].value_counts()

0     274
2      85
1      80
3+     41
Name: Dependents, dtype: int64

In [13]:
# Collapse the open-ended '3+' bucket to the string "3" so the column can be cast to an integer type.
loan_df['Dependents'] = loan_df['Dependents'].replace({'3+': "3"})
loan_df['Dependents'].value_counts()

0    274
2     85
1     80
3     41
Name: Dependents, dtype: int64

In [14]:
# Cast Dependents from strings to 8-bit integers.
# Note: the value 3 stands for "3 or more" dependents.
dependents_as_int = loan_df["Dependents"].astype(np.int8)
loan_df["Dependents"] = dependents_as_int

Cleaning Education Column

In [15]:
# Label frequencies in Education before encoding
loan_df['Education'].value_counts()

Graduate        383
Not Graduate     97
Name: Education, dtype: int64

In [16]:
# Binary-encode Education for machine learning.
# LabelEncoder assigns codes in sorted label order:
#   Graduate     -> 0
#   Not Graduate -> 1
education_codes = label_encoder.fit_transform(loan_df['Education'])
loan_df['Education'] = education_codes

# Code frequencies after encoding
loan_df['Education'].value_counts()

0    383
1     97
Name: Education, dtype: int64

Self Employed Column Cleaning 

In [17]:
# Label frequencies in Self_Employed before encoding
loan_df['Self_Employed'].value_counts()

No     414
Yes     66
Name: Self_Employed, dtype: int64

In [18]:
# Binary-encode Self_Employed for machine learning.
# LabelEncoder assigns codes in sorted label order:
#   No  -> 0
#   Yes -> 1
self_employed_codes = label_encoder.fit_transform(loan_df['Self_Employed'])
loan_df['Self_Employed'] = self_employed_codes

# Code frequencies after encoding
loan_df['Self_Employed'].value_counts()

0    414
1     66
Name: Self_Employed, dtype: int64

Cleaning Property Area

In [19]:
# Label frequencies in Property_Area before encoding
loan_df['Property_Area'].value_counts()

Semiurban    191
Urban        150
Rural        139
Name: Property_Area, dtype: int64

In [20]:
# Encode Property_Area as integers using an explicit label -> code mapping.
property_mapping = {
           'Urban': 0,
           'Semiurban': 1,
           'Rural': 2}

# Series.map turns any label missing from the mapping into NaN, so build the
# encoded column first and verify it before overwriting the original. This also
# stops an accidental re-run of this cell (column already numeric) from wiping the data.
property_codes = loan_df['Property_Area'].map(property_mapping)
assert property_codes.notna().all(), "Property_Area contains labels missing from property_mapping"
loan_df['Property_Area'] = property_codes

# Code frequencies after encoding
loan_df.Property_Area.value_counts()

1    191
0    150
2    139
Name: Property_Area, dtype: int64

In [21]:
# Cleaning Classification Variable
# Encoding Loan_Status Column to binary data for Machine learning Purposes.
# LabelEncoder assigns codes in sorted label order
# (presumably N -> 0, Y -> 1 — TODO confirm against the raw labels).
loan_df['Loan_Status'] = label_encoder.fit_transform(loan_df['Loan_Status'])

# Show the class balance as the cell's last expression so it is actually displayed;
# a value_counts() call in the middle of a cell is evaluated and then discarded.
loan_df.Loan_Status.value_counts()


## Exporting Data

In [22]:

# Write the cleaned DataFrame (index included) to a new CSV file
cleaned_csv_path = 'loan_cleaned.csv'
loan_df.to_csv(cleaned_csv_path)