In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [11]:
# Load the EDA-ready dataset from the working directory, then display its
# (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer="emi_eda_ready.csv")
df.shape


(404800, 27)

In [12]:
# Impute missing values: median for numeric columns, most frequent value for
# text (object) columns.

# bank_balance is presumably read as object dtype (a later cell coerces it with
# pd.to_numeric). Coerce it before the dtype split so its gaps are filled with
# the numeric median rather than with the most frequent string. Values that
# cannot be parsed become NaN and are imputed together with the other gaps.
df["bank_balance"] = pd.to_numeric(df["bank_balance"], errors="coerce")

num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# mode() returns an empty frame when there are no object columns (or no
# non-null values), in which case .iloc[0] would raise IndexError.
cat_modes = df[cat_cols].mode()
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])


In [16]:
# Force bank_balance to a numeric dtype (unparseable entries become NaN), then
# fill the resulting gaps with the column median.
df["bank_balance"] = pd.to_numeric(df["bank_balance"], errors="coerce").pipe(
    lambda balance: balance.fillna(balance.median())
)


In [17]:
# Sanity check: show the dtypes of the three columns used in the ratio features.
df.loc[:, ["monthly_salary", "current_emi_amount", "bank_balance"]].dtypes


monthly_salary        float64
current_emi_amount    float64
bank_balance          float64
dtype: object

In [18]:
# Bank balance expressed as a multiple of monthly salary.
# NOTE: a zero salary yields +/-inf here; this column is recomputed in a later
# cell after zero salaries have been masked.
df["savings_ratio"] = df["bank_balance"].div(df["monthly_salary"])


In [19]:
# Total monthly expenses: sum of the six recurring expense columns
df["total_monthly_expenses"] = (
    df["school_fees"] +
    df["college_fees"] +
    df["travel_expenses"] +
    df["groceries_utilities"] +
    df["other_monthly_expenses"] +
    df["monthly_rent"]
)

# Mask zero AND negative salaries so they can never act as a denominator.
# (Replacing only exact zeros left negative salaries in place, which produced
# negative ratios.) Series.where keeps values where the condition holds and
# writes NaN elsewhere.
# NOTE: the affected rows keep NaN in monthly_salary after this cell.
df["monthly_salary"] = df["monthly_salary"].where(df["monthly_salary"] > 0)

# Financial ratios, each relative to monthly salary
df["debt_to_income"] = df["current_emi_amount"] / df["monthly_salary"]
df["expense_to_income"] = df["total_monthly_expenses"] / df["monthly_salary"]
df["savings_ratio"] = df["bank_balance"] / df["monthly_salary"]

# Handle infinite and missing values generated by ratios.
# The inf -> NaN replacement is applied to the whole frame; the zero fill is
# limited to the three ratio columns.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

ratio_cols = ["debt_to_income", "expense_to_income", "savings_ratio"]
df[ratio_cols] = df[ratio_cols].fillna(0)


In [20]:
# Bucket credit scores into four labelled bands.
# pd.cut intervals are right-closed / left-open by default, so without
# include_lowest=True a score of exactly 300 falls outside (300, 580] and
# becomes NaN. Scores below 300 or above 850 still map to NaN.
df["credit_risk_bucket"] = pd.cut(
    df["credit_score"],
    bins=[300, 580, 670, 740, 850],
    labels=["Poor", "Fair", "Good", "Excellent"],
    include_lowest=True,
)

# Ordinal employment-stability score from years of employment:
#   >= 5 years -> 2, >= 2 years -> 1, otherwise 0
# (a NaN value compares False in both tests and therefore scores 0).
df["employment_stability_score"] = np.where(
    df["years_of_employment"] >= 5, 2,
    np.where(df["years_of_employment"] >= 2, 1, 0)
)


In [23]:
from sklearn.preprocessing import OneHotEncoder

# Names of the columns selected as categorical features.
# "credit_risk_bucket" is the engineered column built with pd.cut in an earlier
# cell. Presumably these feed the OneHotEncoder imported above -- the encoding
# step itself is not part of this section.
categorical_features = [
    "gender", "marital_status", "education",
    "employment_type", "house_type",
    "emi_scenario", "credit_risk_bucket",
]


In [24]:
# Split the frame: emi_eligibility and max_monthly_emi are pulled out as
# separate series, and X holds every remaining column.
y_class = df["emi_eligibility"]
y_reg = df["max_monthly_emi"]
X = df.drop(columns=["emi_eligibility", "max_monthly_emi"])


In [25]:
# Persist the full feature-engineered frame (without the index column) to the
# working directory.
df.to_csv(path_or_buf="emi_feature_engineered.csv", index=False)
