# Data Cleaning

## Setting up Environment

In [192]:
# Third-party packages
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Single encoder instance; it is re-fitted (fit_transform) for every column
# that is label-encoded further down, so it never holds more than the last fit.
label_encoder = LabelEncoder()

# Standard library
import os

In [193]:
# Select which dataset this run cleans. Exactly one flag must be True:
# the load and export cells branch on these flags, so enabling both (or neither)
# makes it ambiguous which file is read and which file is written.
Testing = True
Training = False

if Testing == Training:
    raise ValueError("Set exactly one of Testing / Training to True.")

In [194]:

# Load the raw CSV for the selected dataset; the first CSV column becomes the row index.
# Training is checked first so that, if both flags were ever set, the training file
# is the one that ends up in `loan_df` (same outcome as two independent `if`s).
if Training:
    loan_df = pd.read_csv('loan-train.csv', index_col=0)
elif Testing:
    loan_df = pd.read_csv('loan-test.csv', index_col=0)
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError("Neither Testing nor Training is set; no dataset to load.")

## Preliminary Data Understanding

In [195]:
# Summarise the columns: names, non-null counts and dtypes.
# Non-null counts below the number of entries reveal which columns have missing values.
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 367 entries, LP001015 to LP002989
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             356 non-null    object 
 1   Married            367 non-null    object 
 2   Dependents         357 non-null    object 
 3   Education          367 non-null    object 
 4   Self_Employed      344 non-null    object 
 5   ApplicantIncome    367 non-null    int64  
 6   CoapplicantIncome  367 non-null    int64  
 7   LoanAmount         362 non-null    float64
 8   Loan_Amount_Term   361 non-null    float64
 9   Credit_History     338 non-null    float64
 10  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 34.4+ KB


## Data Cleaning

Remove every row that contains at least one null value.

In [196]:
# Keep only the rows in which every column is populated.
# NOTE(review): in the recorded test-set run this shrinks the frame from 367 to 289 rows;
# confirm that discarding those applicants (rather than imputing) is intended.
loan_df = loan_df.dropna()

In [197]:
# NOTE(review): blanket one-hot encoding is intentionally left disabled here;
# the cells that follow encode each categorical column individually instead.
# # Changing Categorical Data to Numerical For Machine Learning
# loan_df = pd.get_dummies(loan_df)

#### Cleaning Data Columns

Cleaning Gender Column

In [198]:
# Distribution of the raw Gender labels before encoding
loan_df["Gender"].value_counts()

Male      230
Female     59
Name: Gender, dtype: int64

In [199]:
# Binary-encode Gender for modelling. LabelEncoder numbers labels in sorted order:
#   Female -> 0
#   Male   -> 1
# NOTE(review): the encoder is re-fitted on whichever file is loaded, so these codes
# hold only while both labels occur in that file — confirm for the other dataset.
gender_labels = loan_df["Gender"]
loan_df["Gender"] = label_encoder.fit_transform(gender_labels)

In [200]:
# Distribution of Gender after encoding (counts should match the raw labels)
loan_df["Gender"].value_counts()

1    230
0     59
Name: Gender, dtype: int64

Cleaning Married Column

In [201]:
# Distribution of the raw Married labels before encoding
loan_df["Married"].value_counts()

Yes    187
No     102
Name: Married, dtype: int64

In [202]:
# Binary-encode Married for modelling. LabelEncoder numbers labels in sorted order:
#   No  -> 0
#   Yes -> 1
married_labels = loan_df["Married"]
loan_df["Married"] = label_encoder.fit_transform(married_labels)

In [203]:
# Distribution of Married after encoding (counts should match the raw labels)
loan_df["Married"].value_counts()

1    187
0    102
Name: Married, dtype: int64

Cleaning Dependents Column

In [204]:
# Distribution of the raw Dependents values (still strings, including "3+")
loan_df["Dependents"].value_counts()

0     167
2      50
1      42
3+     30
Name: Dependents, dtype: int64

In [205]:
# Collapse the open-ended "3+" bucket to the plain string "3" so the column
# can be cast to an integer type in the next step.
loan_df["Dependents"] = loan_df["Dependents"].replace({'3+': "3"})
loan_df["Dependents"].value_counts()

0    167
2     50
1     42
3     30
Name: Dependents, dtype: int64

In [206]:
# Cast the Dependents strings to 8-bit integers.
# The value 3 stands for "3 or more" dependents, not exactly three.
dependents_as_text = loan_df["Dependents"]
loan_df["Dependents"] = dependents_as_text.astype("int8")

Cleaning Education Column

In [207]:
# Distribution of the raw Education labels before encoding
loan_df["Education"].value_counts()

Graduate        224
Not Graduate     65
Name: Education, dtype: int64

In [208]:
# Binary-encode Education for modelling.
# LabelEncoder numbers labels in sorted order, so the resulting codes are:
#   Graduate     -> 0
#   Not Graduate -> 1
# (The counts displayed below confirm this: the larger "Graduate" group maps to 0.)
loan_df['Education'] = label_encoder.fit_transform(loan_df['Education'])

loan_df['Education'].value_counts()

0    224
1     65
Name: Education, dtype: int64

Cleaning Self_Employed Column

In [209]:
# Distribution of the raw Self_Employed labels before encoding
loan_df["Self_Employed"].value_counts()

No     257
Yes     32
Name: Self_Employed, dtype: int64

In [210]:
# Binary-encode Self_Employed for modelling. LabelEncoder numbers labels in sorted order:
#   No  -> 0
#   Yes -> 1
self_employed_labels = loan_df["Self_Employed"]
loan_df["Self_Employed"] = label_encoder.fit_transform(self_employed_labels)

# Show the encoded distribution
loan_df["Self_Employed"].value_counts()

0    257
1     32
Name: Self_Employed, dtype: int64

Cleaning Property_Area Column

In [211]:
# Distribution of the raw Property_Area labels before encoding
loan_df["Property_Area"].value_counts()

Urban        113
Rural         89
Semiurban     87
Name: Property_Area, dtype: int64

In [212]:
# Encode Property_Area with a fixed, hand-chosen code per label:
#   Urban -> 0, Semiurban -> 1, Rural -> 2
# Any label missing from the mapping would become NaN after .map().
property_mapping = {
    area: code for code, area in enumerate(('Urban', 'Semiurban', 'Rural'))
}

area_labels = loan_df["Property_Area"]
loan_df["Property_Area"] = area_labels.map(property_mapping)

# Show the encoded distribution
loan_df["Property_Area"].value_counts()

0    113
2     89
1     87
Name: Property_Area, dtype: int64

In [213]:
if Training:
    # Show the class balance of the target column. A bare expression nested inside
    # an `if` block is not echoed by the notebook, so it is printed explicitly.
    print(loan_df['Loan_Status'].value_counts())

    # Encode the Loan_Status target as integers for machine learning.
    # LabelEncoder numbers the distinct labels in sorted order.
    # NOTE(review): the label values are not visible in the recorded (Testing) run —
    # confirm which label ends up as 0 and which as 1.
    loan_df['Loan_Status'] = label_encoder.fit_transform(loan_df['Loan_Status'])


## Exporting Data

In [214]:
 if Training:
    # Saving the cleaned dataframe as a new csv
    loan_df.to_csv('loan_train_cleaned.csv')

if Testing:
    loan_df.to_csv('loan_test_cleaned.csv')