In [26]:
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [27]:
# Read the loan-approval CSV (path is relative to the notebook's working directory).
DATA_PATH = "dataset/loan_approval_data.csv"
df = pd.read_csv(DATA_PATH)

In [11]:
df.head()  # NOTE: not the cell's last expression, so this preview is not rendered
#df.info()
df.isnull().sum() # Missing-value count per column (every column reports 50 in the output)

Applicant_ID          50
Applicant_Income      50
Coapplicant_Income    50
Employment_Status     50
Age                   50
Marital_Status        50
Dependents            50
Credit_Score          50
Existing_Loans        50
DTI_Ratio             50
Savings               50
Collateral_Value      50
Loan_Amount           50
Loan_Term             50
Loan_Purpose          50
Property_Area         50
Education_Level       50
Gender                50
Employer_Category     50
Loan_Approved         50
dtype: int64

## Handle Missing Values

In [28]:
# Split the column names by dtype so each group can get its own imputation strategy.
# NOTE(review): per the displayed indexes, the object-dtype group includes the target
# column "Loan_Approved" and the numeric group includes "Applicant_ID", so both are
# imputed along with the real features - confirm that is intended.
categorical_cols = df.select_dtypes(include=["object"]).columns  # text / categorical columns
nums_cols = df.select_dtypes(include=["number"]).columns  # numeric columns

In [29]:
categorical_cols  # Display the object-dtype column names selected for mode imputation

Index(['Employment_Status', 'Marital_Status', 'Loan_Purpose', 'Property_Area',
       'Education_Level', 'Gender', 'Employer_Category', 'Loan_Approved'],
      dtype='object')

In [14]:
nums_cols  # Display the numeric column names selected for mean imputation

Index(['Applicant_ID', 'Applicant_Income', 'Coapplicant_Income', 'Age',
       'Dependents', 'Credit_Score', 'Existing_Loans', 'DTI_Ratio', 'Savings',
       'Collateral_Value', 'Loan_Amount', 'Loan_Term'],
      dtype='object')

In [30]:
# Replace missing numeric entries with the mean of their column.
num_imputer = SimpleImputer(strategy="mean")
numeric_filled = num_imputer.fit_transform(df[nums_cols])
df[nums_cols] = numeric_filled

In [31]:
# Replace missing categorical entries with the most frequent value of their column.
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_filled = categorical_imputer.fit_transform(df[categorical_cols])
df[categorical_cols] = categorical_filled

In [15]:
df.isnull().sum() # Re-check after imputation: every column should now report 0 missing values

Applicant_ID          0
Applicant_Income      0
Coapplicant_Income    0
Employment_Status     0
Age                   0
Marital_Status        0
Dependents            0
Credit_Score          0
Existing_Loans        0
DTI_Ratio             0
Savings               0
Collateral_Value      0
Loan_Amount           0
Loan_Term             0
Loan_Purpose          0
Property_Area         0
Education_Level       0
Gender                0
Employer_Category     0
Loan_Approved         0
dtype: int64

## EDA: Exploratory Data Analysis

In [32]:
# Drop Applicant_ID: it is a row identifier, not a feature the model should train on.
# errors="ignore" keeps this cell idempotent - re-running it after the column is
# already gone would otherwise raise a KeyError.
df = df.drop(columns=["Applicant_ID"], errors="ignore")

In [33]:
# Box plots of selected numeric features, grouped by approval outcome, to spot outliers.
# The features are laid out row by row on a 2x3 grid.
outlier_features = [
    "Coapplicant_Income",
    "Credit_Score",
    "DTI_Ratio",
    "Savings",
    "Age",
    "Loan_Amount",
]

fig, axes = plt.subplots(nrows=2, ncols=3)

# axes.flat walks the grid in row-major order, matching the feature order above.
for panel, feature in zip(axes.flat, outlier_features):
    sns.boxplot(ax=panel, data=df, x="Loan_Approved", y=feature)

plt.tight_layout()

## Feature Encoding

In [34]:
# Label Encoding: convert categorical values into integer codes.
#
# Each column gets its own encoder. Refitting a single LabelEncoder on the second
# column would overwrite the classes learned for the first, making it impossible to
# inverse_transform Education_Level (or encode new Education_Level values) later.
# `le` is kept as the encoder fitted on the target, Loan_Approved.
education_le = LabelEncoder()
le = LabelEncoder()

df["Education_Level"] = education_le.fit_transform(df["Education_Level"])
df["Loan_Approved"] = le.fit_transform(df["Loan_Approved"])

In [35]:
# One-Hot Encoding of the remaining nominal categorical columns.
cols = [
    "Employment_Status",
    "Marital_Status",
    "Loan_Purpose",
    "Property_Area",
    "Gender",
    "Employer_Category",
]

# drop="first": omit the first category of each column;
# sparse_output=False: return a dense NumPy array;
# handle_unknown="ignore": do not raise on categories unseen during fit.
ohe = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")

# Fit on the selected columns and get the 2D array of indicator values.
encoded = ohe.fit_transform(df[cols])

# Wrap the array in a DataFrame that shares df's index and uses the generated column names.
encoded_df = pd.DataFrame(
    data=encoded,
    index=df.index,
    columns=ohe.get_feature_names_out(cols),
)

# Swap the original categorical columns for their one-hot encoded counterparts.
df = pd.concat([df.drop(columns=cols), encoded_df], axis=1)

In [None]:
ohe.get_feature_names_out() # Names of the columns produced by the fitted one-hot encoder

array(['Employment_Status_Salaried', 'Employment_Status_Self-employed',
       'Employment_Status_Unemployed', 'Marital_Status_Single',
       'Loan_Purpose_Car', 'Loan_Purpose_Education', 'Loan_Purpose_Home',
       'Loan_Purpose_Personal', 'Property_Area_Semiurban',
       'Property_Area_Urban', 'Gender_Male',
       'Employer_Category_Government', 'Employer_Category_MNC',
       'Employer_Category_Private', 'Employer_Category_Unemployed'],
      dtype=object)

In [None]:
df.info() # Summary of columns and dtypes: after encoding, the categorical columns are numeric

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 28 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Applicant_Income                 1000 non-null   float64
 1   Coapplicant_Income               1000 non-null   float64
 2   Age                              1000 non-null   float64
 3   Dependents                       1000 non-null   float64
 4   Credit_Score                     1000 non-null   float64
 5   Existing_Loans                   1000 non-null   float64
 6   DTI_Ratio                        1000 non-null   float64
 7   Savings                          1000 non-null   float64
 8   Collateral_Value                 1000 non-null   float64
 9   Loan_Amount                      1000 non-null   float64
 10  Loan_Term                        1000 non-null   float64
 11  Education_Level                  1000 non-null   int32  
 12  Loan_Approved        

## Correlation HeatMap