In [11]:
import os
import kagglehub
import pandas as pd
import joblib

from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

In [12]:
# Point the MLflow client at the tracking server on localhost:8080 and
# select the experiment that subsequent runs are recorded under.
import mlflow
mlflow.set_tracking_uri("http://localhost:8080")
# NOTE(review): main() and tune_and_save() both call
# mlflow.set_experiment("Bank Loan Classification") before starting their
# runs, so the experiment selected here is created but receives no runs
# from main().
mlflow.set_experiment("Model Comparisions")

2025/07/01 15:12:03 INFO mlflow.tracking.fluent: Experiment with name 'Model Comparisions' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/896692842864941115', creation_time=1751362923416, experiment_id='896692842864941115', last_update_time=1751362923416, lifecycle_stage='active', name='Model Comparisions', tags={}>

In [13]:

# Directory where fitted transformers and tuned models are persisted.
# The base directory comes from the ARTIFACT_DIR environment variable and
# falls back to the current directory when the variable is unset *or empty*
# (an empty value previously produced '/saved_models' at the filesystem root).
import os
SAVEDIR = os.path.join(os.getenv('ARTIFACT_DIR') or '.', 'saved_models')
# exist_ok=True keeps the cell idempotent across re-runs.
os.makedirs(SAVEDIR, exist_ok=True)

def load_data():
    """Fetch the bank-loan dataset via kagglehub and return it as a DataFrame.

    Reads the 'Data' sheet of ``Bank_Personal_Loan_Modelling.xlsx`` from the
    downloaded dataset directory and returns it without the 'ID' and
    'ZIP Code' columns.
    """
    dataset_dir = kagglehub.dataset_download("itsmesunil/bank-loan-modelling")
    workbook = os.path.join(dataset_dir, "Bank_Personal_Loan_Modelling.xlsx")
    loans = pd.read_excel(workbook, sheet_name='Data')
    # Drop the two columns by name rather than by axis number.
    return loans.drop(columns=['ID', 'ZIP Code'])


In [14]:

def split_data(df):
    """Split ``df`` into stratified train / validation / test partitions.

    'Personal Loan' is the target; every other column is a feature. 30% of
    the rows are held out as the test set, then 20% of the remainder becomes
    the validation set (i.e. 56% / 14% / 30% overall). Both splits are
    stratified on the target and use ``random_state=42``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``
    """
    target = df['Personal Loan']
    features = df.drop(columns='Personal Loan')

    # Stage 1: carve off the held-out test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=0.3, stratify=target, random_state=42
    )
    # Stage 2: split what is left into train and validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.2, stratify=y_rest, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


In [15]:

def preprocess_fit(X_train, X_val, X_test):
    """Fit the feature transformers on the training split and apply them to all splits.

    'CCAvg' and 'Mortgage' are passed through a Yeo-Johnson PowerTransformer
    followed by a RobustScaler; 'Income', 'Experience' and 'Age' are passed
    through a StandardScaler. Every transformer is fitted on ``X_train`` only
    and then reused for ``X_val`` and ``X_test``.

    The three inputs are copied before any column is overwritten, so the
    caller's DataFrames are not modified.

    Side effects:
        Writes the fitted transformers to ``SAVEDIR`` as ``pt.pkl``,
        ``rs.pkl`` and ``ss.pkl``.

    Returns:
        ``(X_train, X_val, X_test)`` — transformed copies of the inputs.
    """
    robust_cols = ['CCAvg', 'Mortgage']
    standard_cols = ['Income', 'Experience', 'Age']

    power = PowerTransformer(method='yeo-johnson')
    robust = RobustScaler()
    standard = StandardScaler()

    # Work on copies: the inputs are slices produced by train_test_split and
    # assigning into them would mutate (or warn about) the caller's data.
    X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

    # Fit on the training split only, then transform it.
    X_train[robust_cols] = robust.fit_transform(
        power.fit_transform(X_train[robust_cols])
    )
    X_train[standard_cols] = standard.fit_transform(X_train[standard_cols])

    # Validation and test reuse the statistics learned from the training split.
    for split in (X_val, X_test):
        split[robust_cols] = robust.transform(power.transform(split[robust_cols]))
        split[standard_cols] = standard.transform(split[standard_cols])

    # Persist the fitted transformers so the same preprocessing can be replayed.
    joblib.dump(power, os.path.join(SAVEDIR, 'pt.pkl'))
    joblib.dump(robust, os.path.join(SAVEDIR, 'rs.pkl'))
    joblib.dump(standard, os.path.join(SAVEDIR, 'ss.pkl'))

    return X_train, X_val, X_test


In [16]:

def feature_select_fit(X_train, y_train, X_val, X_test):
    """Select 8 features with RFE fitted on the training split.

    Recursive feature elimination is driven by a
    ``LogisticRegression(max_iter=1000)`` estimator. The selector is fitted
    on ``(X_train, y_train)`` and then applied to all three splits; the
    fitted selector is saved to ``SAVEDIR`` as ``selector.pkl``.

    Returns:
        ``(X_train_sel, X_val_sel, X_test_sel)`` — the reduced feature sets,
        in the same order as the inputs.
    """
    rfe = RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=8)
    rfe.fit(X_train, y_train)

    train_sel = rfe.transform(X_train)
    val_sel = rfe.transform(X_val)
    test_sel = rfe.transform(X_test)

    selector_path = os.path.join(SAVEDIR, 'selector.pkl')
    joblib.dump(rfe, selector_path)
    return train_sel, val_sel, test_sel


In [17]:

def balance(X, y):
    """Resample ``(X, y)`` with SMOTE (``random_state=42``) and return the result.

    Returns:
        ``(X_resampled, y_resampled)`` as produced by ``SMOTE.fit_resample``.
    """
    oversampler = SMOTE(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def tune_and_save(X, y, X_val, y_val):
    """Grid-search six classifiers, persist the best of each, and log validation metrics.

    For every entry in the model grid a nested MLflow run is started, a
    5-fold ``GridSearchCV`` (scored by F1, ``n_jobs=-1``) is fitted on
    ``(X, y)``, and the best estimator is:

    * written to ``SAVEDIR`` as ``<name>_model.pkl``,
    * logged to MLflow under the artifact path ``<name>_model``,
    * evaluated on ``(X_val, y_val)`` for accuracy, precision, recall and F1,
      which are logged as ``val_*`` metrics and printed.

    Args:
        X, y: Training features and labels used for the grid search.
        X_val, y_val: Validation features and labels used only for reporting.

    Returns:
        None.

    NOTE(review): main() passes SMOTE-resampled data as ``X, y``, so the
    cross-validation folds contain synthetic samples and the CV scores used
    for model selection are likely optimistic relative to the validation
    metrics. Resampling inside a pipeline per fold would avoid this.
    """
    mlflow.set_experiment("Bank Loan Classification")

    grids = {
        'LogisticRegression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {'C':[0.01,0.1,1,10], 'penalty':['l1','l2'], 'solver':['liblinear']}
        },
        'DecisionTree': {
            # Seeded like the other stochastic estimators so that re-running
            # the notebook reproduces the same tree and metrics.
            'model': DecisionTreeClassifier(random_state=42),
            'params': {'max_depth':[3,5,7,None], 'min_samples_split':[2,5,10], 'min_samples_leaf':[1,2,4]}
        },
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators':[50,100], 'max_depth':[5,10,None]}
        },
        'GradientBoosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {'n_estimators':[50,100], 'learning_rate':[0.01,0.1]}
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'params': {'n_neighbors':[3,5,7]}
        },
        'SVM': {
            'model': SVC(probability=True, random_state=42),
            'params': {'C':[0.1,1,10], 'kernel':['linear','rbf']}
        }
    }

    # Enabling autologging is loop-invariant, so do it once up front.
    mlflow.sklearn.autolog()

    for name, cfg in grids.items():
        # nested=True lets these runs attach to a parent run opened by the caller.
        with mlflow.start_run(run_name=name, nested=True):
            gs = GridSearchCV(cfg['model'], cfg['params'], scoring='f1', cv=5, n_jobs=-1)
            gs.fit(X, y)
            best_model = gs.best_estimator_

            # Save model locally and log it to the active MLflow run.
            model_path = os.path.join(SAVEDIR, f"{name}_model.pkl")
            joblib.dump(best_model, model_path)
            mlflow.sklearn.log_model(best_model, artifact_path=name + "_model")

            # Predict on validation set
            preds = best_model.predict(X_val)

            # Compute metrics
            acc = accuracy_score(y_val, preds)
            prec = precision_score(y_val, preds)
            rec = recall_score(y_val, preds)
            f1 = f1_score(y_val, preds)

            # Log metrics to MLflow
            mlflow.log_metrics({
                "val_accuracy": acc,
                "val_precision": prec,
                "val_recall": rec,
                "val_f1": f1
            })

            print(f"{name} tuned → {gs.best_params_}")
            print(f"→ Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")


In [19]:
def main():
    """Run the end-to-end pipeline inside one parent MLflow run.

    Steps: load the data, split it, fit and apply the preprocessing
    transformers, select features, log the saved preprocessing artifacts,
    balance the training set, then tune and evaluate the models on the
    validation set. The test split is produced but not evaluated here.
    """
    mlflow.set_experiment("Bank Loan Classification")

    with mlflow.start_run(run_name="Preprocessing and Tuning"):
        loans = load_data()
        X_train, X_val, X_test, y_train, y_val, y_test = split_data(loans)
        X_train, X_val, X_test = preprocess_fit(X_train, X_val, X_test)
        X_train_sel, X_val_sel, X_test_sel = feature_select_fit(
            X_train, y_train, X_val, X_test
        )

        # Attach the pickled transformers and selector to the parent run.
        artifact_names = ['pt.pkl', 'rs.pkl', 'ss.pkl', 'selector.pkl']
        for artifact_name in artifact_names:
            artifact_path = os.path.join(SAVEDIR, artifact_name)
            mlflow.log_artifact(artifact_path)

        # Only the training data is resampled; validation stays untouched.
        X_balanced, y_balanced = balance(X_train_sel, y_train)
        tune_and_save(X_balanced, y_balanced, X_val_sel, y_val)


In [20]:
# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

2025/07/01 15:12:03 INFO mlflow.tracking.fluent: Experiment with name 'Bank Loan Classification' does not exist. Creating a new experiment.
2025/07/01 15:12:25 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


LogisticRegression tuned → {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
→ Accuracy: 0.884, Precision: 0.446, Recall: 0.866, F1: 0.589
🏃 View run LogisticRegression at: http://localhost:8080/#/experiments/480529433569303561/runs/cd8b478943d94c7a96483940b4a72f3e
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561


2025/07/01 15:12:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, 31 runs will be omitted.


DecisionTree tuned → {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
→ Accuracy: 0.976, Precision: 0.903, Recall: 0.836, F1: 0.868
🏃 View run DecisionTree at: http://localhost:8080/#/experiments/480529433569303561/runs/f2f8a904a6f34988b3cf75166cddc6a4
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561


2025/07/01 15:12:51 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


RandomForest tuned → {'max_depth': None, 'n_estimators': 100}
→ Accuracy: 0.983, Precision: 0.982, Recall: 0.836, F1: 0.903
🏃 View run RandomForest at: http://localhost:8080/#/experiments/480529433569303561/runs/118ea6ce085145cfb325957ae38d438c
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561


2025/07/01 15:13:05 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


GradientBoosting tuned → {'learning_rate': 0.1, 'n_estimators': 100}
→ Accuracy: 0.980, Precision: 0.934, Recall: 0.851, F1: 0.891
🏃 View run GradientBoosting at: http://localhost:8080/#/experiments/480529433569303561/runs/e2566eb41aa847918813884eb028a1dc
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561


2025/07/01 15:13:18 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


KNN tuned → {'n_neighbors': 3}
→ Accuracy: 0.971, Precision: 0.873, Recall: 0.821, F1: 0.846
🏃 View run KNN at: http://localhost:8080/#/experiments/480529433569303561/runs/44479292bc2c42f58fde217d61997457
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561


2025/07/01 15:13:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


SVM tuned → {'C': 10, 'kernel': 'rbf'}
→ Accuracy: 0.977, Precision: 0.918, Recall: 0.836, F1: 0.875
🏃 View run SVM at: http://localhost:8080/#/experiments/480529433569303561/runs/dc4d3931be524030aab1f7fab97a862f
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561
🏃 View run Preprocessing and Tuning at: http://localhost:8080/#/experiments/480529433569303561/runs/407de987c017453dab80db685e48288c
🧪 View experiment at: http://localhost:8080/#/experiments/480529433569303561
