## **Home Loan Dataset**

### **Phase 1: Data Collection and Preparation**

In [137]:
# importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Task 1.1: Load the Home Loan dataset into a Pandas DataFrame.

In [None]:
# The URL of the raw train data CSV file
url = 'https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv'

# Read the CSV file directly into a pandas DataFrame
df = pd.read_csv(url)
print("Dataset loaded successfully!")
print("\nFirst 5 rows of the DataFrame:")
df.head()

# Create a copy
df_processed = df.copy()
df_processed.head()

Dataset loaded successfully!

First 5 rows of the DataFrame:


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Task 1.2: Inspect the dataset for missing values, duplicates, and data type inconsistencies.

In [139]:
#Getting the count of missing values for every column
print("\nMissing Values Count per Column")
missing_values = df_processed.isnull().sum()
missing_values


Missing Values Count per Column


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [140]:
# Getting the total count of duplicated values
total_duplicates = df_processed.duplicated().sum()
print(f"\nTotal Number of Duplicated values: {total_duplicates}")


Total Number of Duplicated values: 0


In [141]:
df_processed.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [142]:
# checking the data types
df_processed.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [143]:
# checking for missing values
df_processed.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### Task 1.3: Clean the dataset by handling missing values, correcting data types, and addressing outliers.

In [144]:
# checking the shape of the data
df_processed.shape

(614, 13)

#### **Handling Missing Values**


In [145]:
# setting load_id as the index
df_processed.set_index("Loan_ID", inplace = True)

In [146]:
# coverting the Loan AMount Term to object
df_processed['Loan_Amount_Term'] = df_processed["Loan_Amount_Term"].astype(str)

In [147]:
# mapping loan_status from object to int to check correlation 
df_processed["Loan_Status"] = df_processed["Loan_Status"].map({"Y":1,"N": 0})

In [148]:
# mapping credit history from int to object to check correlation
# df_processed["Credit_History"] = df_processed["Credit_History"].map({1.0:"good", 0.0:"bad"})

In [149]:
# displaying the categorical columns
categorical = df_processed.select_dtypes(include =["object"]).columns
categorical

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Loan_Amount_Term', 'Property_Area'],
      dtype='object')

In [150]:
# filling the missing values in the categorical columns
for col in categorical:
    mode_value = df_processed[col].mode()[0]
    df_processed[col].fillna(mode_value, inplace = True)
    df_processed[col].replace("nan", mode_value, inplace=True)
    print(f"Filled missing values in '{col}' with Mode: {mode_value}")

Filled missing values in 'Gender' with Mode: Male
Filled missing values in 'Married' with Mode: Yes
Filled missing values in 'Dependents' with Mode: 0
Filled missing values in 'Education' with Mode: Graduate
Filled missing values in 'Self_Employed' with Mode: No
Filled missing values in 'Loan_Amount_Term' with Mode: 360.0
Filled missing values in 'Property_Area' with Mode: Semiurban


In [151]:
df_processed.isna().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [152]:
# checking numerical columns
numerical = df_processed.select_dtypes(include=["int64", "float64"]).columns
numerical

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Credit_History',
       'Loan_Status'],
      dtype='object')

In [153]:
for col in numerical:
    median_value = df_processed[col].median()
    df_processed[col].fillna(median_value, inplace = True)
    df_processed[col].replace("nan", median_value, inplace=True)
    print(f"Filled missing values in '{col}' with Median: {median_value}")

Filled missing values in 'ApplicantIncome' with Median: 3812.5
Filled missing values in 'CoapplicantIncome' with Median: 1188.5
Filled missing values in 'LoanAmount' with Median: 128.0
Filled missing values in 'Credit_History' with Median: 1.0
Filled missing values in 'Loan_Status' with Median: 1.0


In [154]:
#removing the + from 3+ to ensure the data is clean
df_processed['Dependents'] = df_processed['Dependents'].str.replace('+', '')

#Converting to integer datatype
df_processed['Dependents'] =(df_processed['Dependents']).astype('int64')

In [155]:
df_processed.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [156]:
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [157]:
df_processed


Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1
...,...,...,...,...,...,...,...,...,...,...,...,...
LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,1
LP002979,Male,Yes,3,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,1
LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,1
LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,1


In [158]:
loan_data =df_processed.to_csv("cleaned_home_data.csv", index = True)