In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import mlflow
import mlflow.sklearn


In [3]:
# Read the feature-engineered EMI dataset from the working directory.
df = pd.read_csv("emi_feature_engineered.csv")

# Regression target: the maximum monthly EMI column.
y = df["max_monthly_emi"]

# Features: everything except the regression target and the
# "emi_eligibility" column, which is also removed from the inputs.
X = df.drop(columns=["emi_eligibility", "max_monthly_emi"])


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Select feature columns by dtype *family* rather than by exact dtype.
# ColumnTransformer discards every column that is not listed in one of its
# transformers (its default is remainder="drop"), so matching only
# int64/float64/object would silently lose columns stored as e.g. int32,
# float32 or pandas "category".
# NOTE(review): columns of other dtypes (bool, datetime, ...) are still not
# selected by either list and therefore still dropped - confirm that is intended.
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(include=["object", "category"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric features (zero mean, unit variance).
        ("num", StandardScaler(), num_features),
        # One-hot encode categorical features; categories not seen during
        # fit are ignored at transform time instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)


In [5]:
# Make "Max_EMI_Regression" the active MLflow experiment for the runs below.
mlflow.set_experiment(experiment_name="Max_EMI_Regression")


2026/01/14 14:27:15 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/14 14:27:15 INFO mlflow.store.db.utils: Updating database tables
2026/01/14 14:27:15 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/14 14:27:15 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/14 14:27:15 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/14 14:27:15 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/14 14:27:15 INFO mlflow.tracking.fluent: Experiment with name 'Max_EMI_Regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///e:/EMI_Predict/mlruns/2', creation_time=1768381035988, experiment_id='2', last_update_time=1768381035988, lifecycle_stage='active', name='Max_EMI_Regression', tags={}>

In [6]:
with mlflow.start_run(run_name="Linear_Regression"):
    # Shared preprocessing followed by an ordinary least-squares regressor.
    model = Pipeline(
        steps=[("preprocessor", preprocessor), ("regressor", LinearRegression())]
    )

    # Fit on the training split, then score the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Record the evaluation metrics on the active MLflow run.
    for metric_name, metric_value in (
        ("rmse", rmse),
        ("mae", mean_absolute_error(y_test, y_pred)),
        ("r2", r2_score(y_test, y_pred)),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline (preprocessing + regressor) with the run.
    mlflow.sklearn.log_model(model, "linear_regression_model")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



In [7]:
with mlflow.start_run(run_name="Random_Forest_Regressor"):
    # Shared preprocessing followed by a depth-limited random forest;
    # random_state is fixed so repeated runs build the same forest.
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            (
                "regressor",
                RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42),
            ),
        ]
    )

    # Fit on the training split, then score the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Record the evaluation metrics on the active MLflow run.
    for metric_name, metric_value in (
        ("rmse", rmse),
        ("mae", mean_absolute_error(y_test, y_pred)),
        ("r2", r2_score(y_test, y_pred)),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted pipeline (preprocessing + regressor) with the run.
    mlflow.sklearn.log_model(model, "random_forest_regressor")




In [10]:
# Columns routed to the numeric ("num") branch of the ColumnTransformer.
# The order is kept exactly as declared, since it fixes the order of the
# transformed feature columns.
num_cols = [
    "age", "monthly_salary", "years_of_employment",
    "monthly_rent", "family_size", "dependents",
    "school_fees", "college_fees", "travel_expenses",
    "groceries_utilities", "other_monthly_expenses", "current_emi_amount",
    "credit_score", "bank_balance", "emergency_fund",
    "requested_amount", "requested_tenure",
    "expense_to_income", "savings_ratio", "employment_stability_score",
]


In [11]:
# Columns routed to the one-hot ("cat") branch of the ColumnTransformer.
# The order is kept exactly as declared.
cat_cols = [
    "gender", "marital_status", "education",
    "employment_type", "company_type", "house_type",
    "existing_loans", "emi_scenario", "credit_risk_bucket",
]


In [13]:
import os

import joblib

# Pipeline, ColumnTransformer, StandardScaler, OneHotEncoder and
# RandomForestRegressor are already imported in the notebook's first cell,
# so they are not imported again here.

# Destination of the serialised pipeline (relative to the working directory).
MODEL_PATH = "models/emi_model.pkl"

# Preprocessing driven by the explicit column lists (num_cols / cat_cols)
# instead of dtype inspection. NOTE: this rebinds the notebook-level name
# `preprocessor`, replacing the dtype-based transformer defined earlier.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(
            handle_unknown="ignore",  # ignore categories unseen during fit
            sparse_output=False       # return a dense array
        ), cat_cols)
    ],
    remainder="drop"  # any column not listed above is discarded
)

# Preprocessing + random forest with no depth limit, using all CPU cores.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", RandomForestRegressor(
            n_estimators=300,
            random_state=42,
            n_jobs=-1,
            max_depth=None
        ))
    ]
)

# Fitted on the training split only; the held-out split is not used here.
pipeline.fit(X_train, y_train)

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Last expression: displays the list of file names written by joblib.
joblib.dump(pipeline, MODEL_PATH)


['models/emi_model.pkl']