In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import scipy.stats as stat

### **Phase 1**: Data Collection and Preparation

**Task 1.1**: Data Collection

In [2]:
train_data = r'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv'
test_data =r'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv'

In [3]:
df1 = pd.read_csv(train_data)
df2 = pd.read_csv(test_data)

In [4]:
# Save the dataset
dataset = pd.concat([df1, df2], ignore_index=True)

# Load the dataset

#Create a copy of the data to preserve the original dataset
df = dataset.copy()

#Checkthe fist five rows of the data
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


 From the head view, the following observations were made:

- There are missing values in some credit history records
- There are otliers (0.0) in the coapplicant income
There are missing values in the Loan Status

In [5]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
976,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777.0,113.0,360.0,1.0,Urban,
977,LP002975,Male,Yes,0,Graduate,No,4158,709.0,115.0,360.0,1.0,Urban,
978,LP002980,Male,No,0,Graduate,No,3250,1993.0,126.0,360.0,,Semiurban,
979,LP002986,Male,Yes,0,Graduate,No,5000,2393.0,158.0,360.0,1.0,Rural,
980,LP002989,Male,No,0,Graduate,Yes,9200,0.0,98.0,180.0,1.0,Rural,


From the tail view, it is observed that,

- There seems to be wide range of figures of coapplicant income loan amount

**Task1.2**: **Preliminary Data Analysis**

In [6]:
# Check the dimension

df.shape

(981, 13)

In [7]:
# See what the dataset is made of
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            981 non-null    object 
 1   Gender             957 non-null    object 
 2   Married            978 non-null    object 
 3   Dependents         956 non-null    object 
 4   Education          981 non-null    object 
 5   Self_Employed      926 non-null    object 
 6   ApplicantIncome    981 non-null    int64  
 7   CoapplicantIncome  981 non-null    float64
 8   LoanAmount         954 non-null    float64
 9   Loan_Amount_Term   961 non-null    float64
 10  Credit_History     902 non-null    float64
 11  Property_Area      981 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 99.8+ KB


From the information summary displayed above, the following observations were made

- Some columns have missing values
- The dependents data type is objects (should be converted to integer)
- The coapplicant income data type is float(should be converted to integer)


**Data Cleaning**

In [16]:

df['Dependents'] = df['Dependents'].replace('3+', 3)


In [18]:
df['Dependents'] = df['Dependents'].astype('int8')

ValueError: cannot convert float NaN to integer

In [None]:
# Check for duplicate items
df.duplicated().sum()

np.int64(0)

In [None]:
# Check if any difference has been effected
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
# Check for missing values

df.isna().sum()

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

According to the information the following columns have missing values:

- Gender : 24
- Married : 3
- Dependents : 25
- Self-Employed : 55
- Loan Amount : 27
- Loan Amount Term : 20
- Credit History : 79
- Loan Status : 367

We will fix each column one after the other.

In [None]:
# To fix missing columns in Gender, I will use the mode

df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

df.isnull().sum()

Loan_ID                0
Gender                 0
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [None]:
# To fix missing columns in Gender, I will also use the mode, because it is the column contains categorical data

df['Married'] = df['Married'].fillna(df['Married'].mode()[0])

df.isnull().sum()

Loan_ID                0
Gender                 0
Married                0
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64