# Data Preprocessing & EDA
### Loan Default Prediction Project

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category/module arguments this hides every warning
# raised for the rest of the session, not just cosmetic ones. Consider
# narrowing it if a specific warning is the real target.
warnings.filterwarnings("ignore")

## 1. Data Loading & Initial Inspection

In [None]:
# Load the raw loan dataset and print a quick health report: dimensions,
# per-column missing-value counts, and the number of fully duplicated rows.
df = pd.read_csv("../loan_data.csv")

missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()

# Each section is followed by one blank line to keep the report readable.
print(f"Shape: {df.shape}", end="\n\n")
print("Null values:", missing_per_column, sep="\n", end="\n\n")
print(f"Duplicates: {duplicate_row_count}")

# Last expression of the cell: rendered by the notebook as the first rows.
df.head()

In [None]:
# Summary statistics for the loaded frame, shown as the cell's output.
# With default arguments pandas summarises only the numeric columns when the
# frame mixes numeric and object dtypes.
df.describe()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many rows fall into each target class.
fig, ax = plt.subplots(figsize=(6, 4))

# value_counts() sorts by frequency, not by label, so its order is not
# guaranteed to be [0, 1]. Pin the order explicitly so each bar always sits
# under the matching hard-coded label (and a missing class shows as 0).
counts = df["loan_status"].value_counts().reindex([0, 1], fill_value=0)

ax.bar(["No Default (0)", "Default (1)"], counts.values, color=["skyblue", "salmon"])
ax.set_title("Target Class Distribution")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Numeric features examined throughout the EDA (reused by later cells).
numeric_cols = [
    "person_age",
    "person_income",
    "person_emp_exp",
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "cb_person_cred_hist_length",
    "credit_score",
]

# One panel per feature on a 2x4 grid, flattened so panels can be walked
# in the same order as the feature list.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
axes = axes.ravel()

# Split the rows by outcome once; each panel overlays the two subsets.
no_default_rows = df.loc[df["loan_status"] == 0]
default_rows = df.loc[df["loan_status"] == 1]

hist_style = {"bins": 30, "alpha": 0.6}
for panel, feature in zip(axes, numeric_cols):
    panel.hist(no_default_rows[feature], label="No Default", color="skyblue", **hist_style)
    panel.hist(default_rows[feature], label="Default", color="salmon", **hist_style)
    panel.set_title(feature)
    panel.legend(fontsize=7)

fig.suptitle("Numeric Feature Distributions by Loan Status", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Categorical features whose levels are compared by default rate.
cat_cols = [
    "person_gender",
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file",
]

# A single row of panels, one per categorical feature.
fig, axes = plt.subplots(nrows=1, ncols=len(cat_cols), figsize=(20, 5))

for panel, feature in zip(axes, cat_cols):
    # Mean of loan_status within each level, highest first.
    rate_by_level = df.groupby(feature)["loan_status"].mean()
    rate_by_level = rate_by_level.sort_values(ascending=False)

    panel.bar(rate_by_level.index, rate_by_level.values, color="steelblue")
    panel.set_title(feature)
    panel.set_ylabel("Default Rate")
    # Tilt the level names so longer labels do not overlap.
    panel.tick_params(axis="x", rotation=30)

fig.suptitle("Default Rate by Categorical Feature", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric features
# and the target column.
corr_columns = [*numeric_cols, "loan_status"]
corr = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(data=corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

## 3. Data Cleaning

In [None]:
# Remove every row that has at least one missing value.
df = df.dropna()
print(f"Shape after dropping nulls: {df.shape}")

# Trim the upper tail of each column in turn: rows above that column's 99th
# percentile are discarded. Each cutoff is computed on the rows that survived
# the previous columns' filters, so the order of this list matters.
for col in ["person_age", "person_income", "person_emp_exp"]:
    cap = df[col].quantile(0.99)
    df = df[df[col] <= cap]

# Boolean filtering yields a frame that pandas still tracks as derived from
# its parent. Take an explicit copy so later in-place column assignments act
# on an independent frame and do not raise SettingWithCopyWarning.
df = df.copy()
print(f"Shape after outlier removal: {df.shape}")

In [None]:
# Encode the two binary categoricals as 0/1 integers.
# Series.map with a dict turns any value that is not a key (different casing,
# stray whitespace, or an already-encoded value on a cell re-run) into NaN.
df["previous_loan_defaults_on_file"] = df["previous_loan_defaults_on_file"].map({"Yes": 1, "No": 0})
df["person_gender"] = df["person_gender"].map({"male": 1, "female": 0})

print("Binary encoding done.")
# Sanity check of the encoded combinations. dropna=False keeps rows where the
# mapping produced NaN, so unmapped categories show up here instead of being
# silently left out of the table.
df[["person_gender", "previous_loan_defaults_on_file"]].value_counts(dropna=False)

## 4. Train-Test Split & Preprocessing Preview

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

# Separate the feature matrix from the binary target.
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

# Hold out 20% of the rows for testing. stratify=y keeps the default /
# no-default ratio the same in both partitions, so evaluation on the test set
# is not skewed by a chance imbalance; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {X_train.shape}  |  Test: {X_test.shape}")
# Counts over the full target (before splitting).
print(f"Class balance - 0: {(y==0).sum()}, 1: {(y==1).sum()}")