In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    roc_auc_score
)

import mlflow
import mlflow.sklearn


In [2]:
# Load the feature-engineered dataset from the working directory.
df = pd.read_csv("emi_feature_engineered.csv")

# The classification target is "emi_eligibility". "max_monthly_emi" is removed
# from the features as well (presumably a separate target -- TODO confirm).
y = df["emi_eligibility"]
X = df.drop(columns=["emi_eligibility", "max_monthly_emi"])


In [3]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# seeded, so re-running this cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# Partition the feature columns by dtype: int64/float64 columns are scaled,
# object columns are one-hot encoded.
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

numeric_step = ("num", StandardScaler(), num_features)
# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features)

# Only the two column groups above are passed to the transformer.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [5]:
# Select the MLflow experiment that the runs below are logged under.
mlflow.set_experiment(experiment_name="EMI_Eligibility_Classification")


2026/01/18 16:11:12 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/18 16:11:12 INFO mlflow.store.db.utils: Updating database tables
2026/01/18 16:11:12 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/18 16:11:12 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/18 16:11:12 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/18 16:11:12 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='file:///e:/EMI_Predict/mlruns/1', creation_time=1768368636089, experiment_id='1', last_update_time=1768368636089, lifecycle_stage='active', name='EMI_Eligibility_Classification', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [7]:
# Logistic-regression run: preprocessing + classifier in a single pipeline.
with mlflow.start_run(run_name="Logistic_Regression"):
    classifier = LogisticRegression(max_iter=1000)
    model = Pipeline([("preprocessor", preprocessor), ("classifier", classifier)])

    # Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average="weighted")

    # Record both metrics and the fitted pipeline under this run.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_weighted", f1_weighted)
    mlflow.sklearn.log_model(model, "logistic_regression_model")




In [8]:
# Random-forest run: same preprocessing step, tree-ensemble classifier.
with mlflow.start_run(run_name="Random_Forest"):
    classifier = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
    model = Pipeline([("preprocessor", preprocessor), ("classifier", classifier)])

    # Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average="weighted")

    # Record both metrics and the fitted pipeline under this run.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_weighted", f1_weighted)
    mlflow.sklearn.log_model(model, "random_forest_model")




In [9]:
from sklearn.preprocessing import LabelEncoder

# Convert the class labels in y to integer codes; the encoder is kept so the
# codes can be mapped back to the original labels.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# NOTE: this rebinds X_train / X_test / y_train / y_test. From this cell on,
# y_train and y_test hold the integer codes rather than the original labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [10]:
# XGBoost run: same preprocessing step, gradient-boosted classifier.
with mlflow.start_run(run_name="XGBoost"):
    classifier = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",
        random_state=42,
    )
    model = Pipeline([("preprocessor", preprocessor), ("classifier", classifier)])

    # Fit on the training split, then predict the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average="weighted")

    # Record both metrics and the fitted pipeline under this run.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_weighted", f1_weighted)
    mlflow.sklearn.log_model(model, "xgboost_model")




In [5]:
import os

import joblib
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Recreate feature lists (DO NOT rely on state)
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_features = X.select_dtypes(include=["object"]).columns.tolist()

# Rebuild the training split from the ORIGINAL labels in `y` instead of reusing
# the notebook-level y_train: an earlier cell rebinds y_train to integer codes
# for XGBoost, which would make the saved model predict integers instead of the
# label strings. Same parameters as the first split, so the rows are the same.
X_train_final, X_holdout_final, y_train_final, y_holdout_final = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fresh, unfitted preprocessing step; dense one-hot output.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features)
    ]
)

# Final estimator: random forest with balanced class weights, using all cores.
final_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(
            n_estimators=300,
            max_depth=10,
            random_state=42,
            n_jobs=-1,
            class_weight="balanced"
        ))
    ]
)

final_model.fit(X_train_final, y_train_final)

# Make sure the output directory exists so the dump works on a fresh checkout.
os.makedirs("models", exist_ok=True)

# Kept as the last expression so the cell still displays the written path(s).
joblib.dump(final_model, "models/emi_eligibility_classifier.pkl")


['models/emi_eligibility_classifier.pkl']

In [6]:
# Reload the persisted pipeline and run it on a single training row as a smoke test.
model = joblib.load("models/emi_eligibility_classifier.pkl")

# Double brackets keep the row as a one-row DataFrame rather than a Series.
sample = X_train.iloc[[0]]
prediction = model.predict(sample)
probabilities = model.predict_proba(sample)

print("Prediction:", prediction)
print("Probability:", probabilities)


Prediction: ['Eligible']
Probability: [[0.54681473 0.3418478  0.11133746]]
