In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the loan-approval dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer = "loan_approval_data.csv")

In [None]:
# First look at the data. A notebook cell only renders its *last* expression,
# so intermediate results must be passed to display() explicitly or they are
# silently discarded.
display(df.head())
df.info()  # prints its own report (returns None), so no display() needed
display(df.isnull().sum())  # missing-value count per column
df.describe()

# Handle Missing Values

In [None]:
# Split the column labels by dtype so each group can be imputed with a
# strategy that suits it.
categorical_cols = df.select_dtypes(include = "object").columns
numerical_cols = df.select_dtypes(include = "number").columns

In [None]:
# Combined number of columns across the two dtype groups.
# NOTE(review): presumably meant to be compared against df.shape[1] — confirm.
len(categorical_cols) + len(numerical_cols)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing numeric values with the mean of their column.
num_imp = SimpleImputer(strategy = "mean")
num_imp.fit(df[numerical_cols])
df[numerical_cols] = num_imp.transform(df[numerical_cols])


In [None]:
# Fill missing categorical values with the most frequent value of their column.
# NOTE(review): categorical_cols presumably still contains Applicant_ID and the
# Loan_Approved target at this point; imputing target labels may not be
# intended — confirm.
cat_imp = SimpleImputer(strategy = "most_frequent")
cat_imp.fit(df[categorical_cols])
df[categorical_cols] = cat_imp.transform(df[categorical_cols])

In [None]:
# Drop the identifier column. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is
# already gone.
df = df.drop(columns = "Applicant_ID", errors = "ignore")

# Only the last expression of a cell is rendered, so show the preview explicitly.
display(df.head())
df.isnull().sum()  # remaining missing values per column after imputation

# Exploratory Data Analysis (EDA)

In [None]:
# How balanced are the target classes?

classes_count = df["Loan_Approved"].value_counts()

# value_counts() orders its result by frequency, not by label, so the wedge
# labels must come from the index. A hard-coded ["No", "Yes"] list mislabels
# the chart whenever "Yes" is the more frequent class.
plt.pie(classes_count, labels = classes_count.index.astype(str), autopct = "%1.1f%%")
plt.title("IS Loan Approved or not?")

In [None]:
# Category frequencies: Gender, with the count printed on each bar.
gender_cnt = df["Gender"].value_counts()
ax = sns.barplot(data = gender_cnt)
ax.bar_label(container = ax.containers[0])

In [None]:
# Category frequencies: Education_Level, with the count printed on each bar.
edu_cnt = df["Education_Level"].value_counts()
ax = sns.barplot(data = edu_cnt)
ax.bar_label(container = ax.containers[0])

In [None]:
# Distribution of applicant income (20-bin histogram).
sns.histplot(df, x = "Applicant_Income", bins = 20)

In [None]:
# Distribution of co-applicant income (20-bin histogram).
sns.histplot(df, x = "Coapplicant_Income", bins = 20)

In [None]:
# Outlier check: applicant income per approval outcome, as a box plot.
sns.boxplot(df, x = "Loan_Approved", y = "Applicant_Income")

In [None]:
# 2x2 grid of box plots: one numeric feature per panel, split by approval outcome.
fig, axes = plt.subplots(nrows = 2, ncols = 2)
panel_features = ["Applicant_Income", "Credit_Score", "DTI_Ratio", "Savings"]

# axes.flat walks the grid row by row, matching the feature order above.
for panel, feature in zip(axes.flat, panel_features):
    sns.boxplot(data = df, x = "Loan_Approved", y = feature, ax = panel)

plt.tight_layout()

In [None]:
# Credit score distribution, split by loan approval outcome (side-by-side bars).
sns.histplot(df, x = "Credit_Score", hue = "Loan_Approved", bins = 20, multiple = "dodge")

In [None]:
# Applicant income distribution, split by loan approval outcome (side-by-side bars).
sns.histplot(df, x = "Applicant_Income", hue = "Loan_Approved", bins = 20, multiple = "dodge")

In [None]:
# Preview the first five rows before encoding.
df.head(n = 5)


# Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label encoding: replace each category string with an integer code.

# Encode Education_Level in place. Writing the result to a differently-cased
# name ("Education_level") would leave the original string column unencoded
# and add a stray duplicate column.
# NOTE(review): LabelEncoder assigns codes by sorted label order, which may
# not match the real ordering of education levels — confirm this is acceptable.
edu_le = LabelEncoder()
df["Education_Level"] = edu_le.fit_transform(df["Education_Level"])

# A separate encoder for the target, so both mappings stay recoverable through
# edu_le.classes_ and le.classes_.
le = LabelEncoder()
df["Loan_Approved"] = le.fit_transform(df["Loan_Approved"])

In [None]:
# Preview the first five rows after label encoding.
df.head(n = 5)

In [None]:
# One-hot encoding of the nominal (unordered) categorical columns.
cols = [
    "Employment_Status",
    "Marital_Status",
    "Loan_Purpose",
    "Property_Area",
    "Gender",
    "Employer_Category",
]

# drop="first" omits one indicator column per feature; sparse_output=False
# returns a dense array.
ohe = OneHotEncoder(drop = "first", sparse_output = False, handle_unknown = "ignore")
encoded = ohe.fit_transform(df[cols])

# Wrap the array in a frame that shares df's index so the two can be joined
# column-wise afterwards.
encoded_df = pd.DataFrame(
    data = encoded,
    index = df.index,
    columns = ohe.get_feature_names_out(cols),
)

In [None]:
# Replace the original categorical columns with their one-hot indicator columns.
remaining = df.drop(columns = cols)
df = pd.concat([remaining, encoded_df], axis = "columns")

In [None]:
# Inspect the encoded frame. df.head() is not the last expression of the cell,
# so it has to go through display() or its output is discarded.
display(df.head())
df.info()  # prints its own report


In [None]:
# Show the column labels of the frame after encoding.
df.columns

# Correlation Heatmap


In [None]:
# Pairwise Pearson correlation between all numeric columns.
# Note: num_cols holds the numeric sub-frame itself, not a list of column names.
num_cols = df.select_dtypes(include = ["number"])
corr_mat = num_cols.corr(method = "pearson")

In [None]:
# Correlation of every numeric feature with the target, strongest positive first.
# corr_mat already is the correlation matrix, so index it directly. Calling
# .corr() on it again would correlate the matrix's columns with each other
# and report different numbers.
corr_mat["Loan_Approved"].sort_values(ascending = False)

In [None]:
# Annotated heatmap of the correlation matrix, values shown to two decimals.
heatmap_kwargs = {"annot": True, "fmt": ".2f", "cmap": "coolwarm"}
plt.figure(figsize = (15, 8))
sns.heatmap(corr_mat, **heatmap_kwargs)