Import Libraries and Load Data

In [12]:
# core libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import warnings
warnings.filterwarnings('ignore')

# preprocessing libraries
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix

# statistical libraries
from scipy import stats
from scipy.stats import zscore, skew

# set style for better visualisations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")




Libraries imported successfully!


In [13]:
# load in the dataset
data= 'cleaned_homeloan_dataset.csv'

In [14]:
df= pd.read_csv(data)

In [15]:
df.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.001808,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


2. EDA-Based Data Quality Assessment

Based on EDA findings, let's assess the specific issues identified 

In [22]:
num_cols= df.select_dtypes(include=[np.number]).columns
num_cols

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [30]:
# create a copy for preprocessing 
df_processed = df.copy()

numerical= df_processed[num_cols]

# 1. checking for missing values (EDA showed no missing values)
print("\n1.  Missing Values:")
missing_values = df_processed.isnull().sum()
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found (as expected from EDA)")

#2. check for duplicates
print("\n2. Duplicate Rows:")
duplicates = df_processed.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print(f'Percentage of duplicates: {(duplicates/len(df_processed))* 100:.2sf}%')


# 3. Check skewness for variables identified in EDA as right-skewed
print("\n3. Skewness Analysis (EDA identified right-skewed variables):")    
skewed_vars = ['ApplicantIncome', 'CoapplicantIncome', 'loanAmount']
for var in skewed_vars:
    if var in df_processed.columns:
        skewness = skew(df_processed[var])
        print(f"{var}: skewness = {skewness:.3f} ({'right-skewed' if skewness > 0.5 else 'approximately normal'})")

# 4. check correlation with target (EDA evidence)
print("\n4.  correlation with Quality  (EDA Evidence):")
correlations = numerical.corr()['LoanAmount'].sort_values(key=abs, ascending=False)
print("High-signal features (|correlation| > 0.2):")
high_signal = correlations[abs(correlations) >  0.2].drop('Credit_History')
for feature, corr in high_signal.items9():
    print(f" {feature}: {corr:.3sf}")





1.  Missing Values:
No missing values found (as expected from EDA)

2. Duplicate Rows:
Number of duplicate rows: 0

3. Skewness Analysis (EDA identified right-skewed variables):
ApplicantIncome: skewness = 6.850 (right-skewed)
CoapplicantIncome: skewness = 7.402 (right-skewed)

4.  correlation with Quality  (EDA Evidence):
High-signal features (|correlation| > 0.2):


KeyError: "['Credit_History'] not found in axis"

In [17]:
df_processed.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64