EDA and data cleaning

In [3]:
import os
import sys

In [4]:
def setup_project_path():
    """Set up the project path for imports and data access"""
    current_dir = os.getcwd()
    
    if 'jupyter_notebooks' in current_dir:
        os.chdir(os.path.dirname(current_dir))
        print("Changed to project root directory")
    else:
        print("No 'jupyter_notebooks' directory found the the current path")
    
    # Add project root to Python path for imports
    project_root = os.getcwd()
    if project_root not in sys.path:
        sys.path.insert(0, project_root)
    
    return os.getcwd()


Load Data

In [5]:
from pathlib import Path
import pandas as pd

root = setup_project_path()
file_path = Path(root) / "outputs" / "datasets" / "collection" / "loan_approval.csv"

if not file_path.exists():
    raise FileNotFoundError(f"Dataset not found at: {file_path}")

df = pd.read_csv(file_path).drop(['name'], axis=1)
df.head(3)


Changed to project root directory


Unnamed: 0,city,income,credit_score,loan_amount,years_employed,points,loan_approved
0,East Jill,113810,389,39698,27,50.0,False
1,New Jamesside,44592,729,15446,28,55.0,False
2,Lake Roberto,33278,584,11189,13,45.0,False


In [6]:
Data Exploration

SyntaxError: invalid syntax (4001574206.py, line 1)

In [7]:
%pip uninstall -y numpy
%pip install numpy==2.1
%pip install -U numba ydata-profiling


Found existing installation: numpy 2.1.0
Uninstalling numpy-2.1.0:
  Successfully uninstalled numpy-2.1.0
Note: you may need to restart the kernel to use updated packages.
Collecting numpy==2.1
  Using cached numpy-2.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Using cached numpy-2.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
Installing collected packages: numpy
Successfully installed numpy-2.1.0
Note: you may need to restart the kernel to use updated packages.
Collecting numba
  Using cached numba-0.62.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting llvmlite<0.46,>=0.45.0dev0 (from numba)
  Using cached llvmlite-0.45.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB)
Note: you may need to restart the kernel to use updated packages.


In [8]:
from ydata_profiling import ProfileReport
pandas_report = ProfileReport(df=df, minimal=True)
pandas_report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:00<00:00, 82.51it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
approve_counts = df['loan_approved'].value_counts()
approve_counts

loan_approved
False    1121
True      879
Name: count, dtype: int64

In [12]:
percentage_approved = df['loan_approved'].value_counts(normalize=True) * 100
percentage_approved

loan_approved
False    56.05
True     43.95
Name: proportion, dtype: float64

There is a moderate difference imbalance between loan approvals with false having a higher proportion meaning some models may be biased towards the rejection of loan approvals. The variables are split by 5 numbers, text and a boolean.

Correlation study

In [14]:
%pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.9.3-py3-none-any.whl.metadata (10 kB)
Downloading feature_engine-1.9.3-py3-none-any.whl (229 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.9.3
Note: you may need to restart the kernel to use updated packages.


In [16]:
from feature_engine.encoding import OneHotEncoder
encoder = OneHotEncoder(variables=df.columns[df.dtypes=='object'].to_list(), drop_last=False)
df_ohe = encoder.fit_transform(df)
print(df_ohe.shape)
df_ohe.head(10)

(2000, 1888)


Unnamed: 0,income,credit_score,loan_amount,years_employed,points,loan_approved,city_East Jill,city_New Jamesside,city_Lake Roberto,city_West Melanieview,...,city_South Kenneth,city_Cookshire,city_Jenniferstad,city_Basston,city_Tylertown,city_Robertton,city_New Frank,city_East Haley,city_Adamland,city_New Nathantown
0,113810,389,39698,27,50.0,False,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,44592,729,15446,28,55.0,False,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33278,584,11189,13,45.0,False,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,127196,344,48823,29,50.0,False,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,66048,496,47174,4,25.0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,62098,689,19217,29,65.0,True,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,59256,373,40920,40,35.0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,48289,524,45866,20,25.0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,126530,367,14826,36,55.0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,43434,446,18359,8,20.0,False,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
corr_spearman = df_ohe.corr(method='spearman')['loan_approved'].sort_values(key=abs, ascending=False)[1:].head(10)
corr_spearman

points                   0.862704
credit_score             0.713124
income                   0.237363
loan_amount             -0.158243
years_employed           0.104049
city_Stevenside          0.035729
city_Michelletown        0.035729
city_East Jennifer       0.035729
city_Port John           0.035729
city_West Christopher    0.035729
Name: loan_approved, dtype: float64

In [19]:
corr_pearson = df_ohe.corr(method='pearson')['loan_approved'].sort_values(key=abs, ascending=False)[1:].head(10)
corr_pearson

points                   0.821415
credit_score             0.715788
income                   0.238066
loan_amount             -0.157859
years_employed           0.104408
city_Elizabethland       0.035729
city_Port John           0.035729
city_Michelletown        0.035729
city_Port Michaeltown    0.035729
city_North Jeffery       0.035729
Name: loan_approved, dtype: float64

Spearman correlation measures monotonic relationships, in which variables move in the same direction but not nexessarily lineear.
Pearson method measures linear relationships, in which 1 is a positive correlation and -1 is a negative correlation.
In the spearman correlation the variable points and credit score have a strong positive correlation suggesting as points increase so does loan approval.
income and loan_amount had a weak correlation and years_employed had a very weak correlation. This would suggest a higher income and lower loan amount would increase loan approval with more year employed slightly improving chances of approval.
The city variable has an extremely weak correlation and has little effect on loan approval. 

Pearson correlation showed a similiar pattern where points and credit score had a strong positive correlation and income had a very weak correlation.

In [21]:
vars_to_study = ['points', 'credit_score', 'income', 'loan_amount', 'years_employed']
vars_to_study

['points', 'credit_score', 'income', 'loan_amount', 'years_employed']