In [18]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def build_and_evaluate_pipeline(data: pd.DataFrame, target_column: str, model, param_grid: dict):
    """
    Build, balance, train, and evaluate a machine learning pipeline using GridSearchCV.

    Steps performed:
    1. Split ``data`` into features and the ``target_column`` target.
    2. Balance the classes with ``RandomUnderSampler``.
    3. Hold out 20% of the balanced data as a test set.
    4. Preprocess numeric columns (median imputation + min-max scaling) and
       categorical columns (most-frequent imputation + one-hot encoding).
    5. Tune ``model`` with 5-fold cross-validated grid search on accuracy.
    6. Print the best parameters, the CV score, and the test-set confusion
       matrix and classification report.

    Parameters:
    - data: pandas DataFrame containing features and target.
    - target_column: name of the target column.
    - model: any scikit-learn compatible estimator.
    - param_grid: dict, parameter grid for GridSearchCV. Estimator parameters
      must be prefixed with ``classifier__`` because the estimator is the
      pipeline step named "classifier".

    Returns:
    - The best pipeline found by the grid search (``grid.best_estimator_``).

    Raises:
    - KeyError: if ``target_column`` is not a column of ``data``.
    """
    # 1. Separate features and target.
    # Use the caller-supplied column name instead of a hard-coded one so the
    # function works for any dataset, and fail early with a clear message.
    if target_column not in data.columns:
        raise KeyError(f"Target column {target_column!r} not found in data")
    y = data[target_column]
    X = data.drop(columns=target_column)

    # 2. Perform class balancing by undersampling
    # NOTE(review): undersampling happens before the train/test split, so the
    # held-out test set is balanced too; the reported metrics describe the
    # balanced distribution, not the original class ratio.
    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X, y)

    print(" Class distribution after undersampling:")
    print(y_resampled.value_counts())

    # 3. Split data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    # 4. Identify feature types
    # Only these dtypes are routed to a transformer; columns of any other
    # dtype are left out of both lists and are therefore dropped by the
    # ColumnTransformer below (its default is remainder="drop").
    numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # 5. Define transformers
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen during fit are encoded as all-zeros, not an error.
        ('onehot', OneHotEncoder(handle_unknown="ignore"))
    ])

    # 6. Column transformer
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    # 7. Full pipeline
    # Preprocessing lives inside the pipeline so it is re-fit on each CV
    # training fold rather than on the whole training set.
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", model)
    ])

    # 8. GridSearchCV
    grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid.fit(X_train, y_train)

    model_name = model.__class__.__name__
    print(f" Best Parameters for {model_name}: {grid.best_params_}")
    print(f" Best Cross-Validation Score: {grid.best_score_:.4f}")

    # 9. Predict and evaluate
    best_pipeline = grid.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    print(f" {model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f" {model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

    return best_pipeline


# Read the loan dataset from disk
df = pd.read_csv("loan_data.csv")

# Hyper-parameter search spaces. The "classifier__" prefix addresses the
# pipeline step named "classifier" inside build_and_evaluate_pipeline.
rf_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
)

lr_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__penalty=["l2"],
    classifier__solver=["liblinear"],
)

svc_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__kernel=["linear", "rbf"],
)

# Tune and evaluate each candidate model in turn, keeping the best pipeline
pipeline_rf = build_and_evaluate_pipeline(
    df,
    "loan_status",
    RandomForestClassifier(random_state=42),
    rf_grid,
)
pipeline_lr = build_and_evaluate_pipeline(
    df,
    "loan_status",
    LogisticRegression(max_iter=1000),
    lr_grid,
)
pipeline_svm = build_and_evaluate_pipeline(
    df,
    "loan_status",
    SVC(),
    svc_grid,
)


 Class distribution after undersampling:
loan_status
0    10000
1    10000
Name: count, dtype: int64
 Best Parameters for RandomForestClassifier: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
 Best Cross-Validation Score: 0.8989
 RandomForestClassifier Confusion Matrix:
[[1784  235]
 [ 145 1836]]
 RandomForestClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2019
           1       0.89      0.93      0.91      1981

    accuracy                           0.91      4000
   macro avg       0.91      0.91      0.90      4000
weighted avg       0.91      0.91      0.90      4000

 Class distribution after undersampling:
loan_status
0    10000
1    10000
Name: count, dtype: int64
 Best Parameters for LogisticRegression: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
 Best Cross-Validation Score: 0.8771
 Logisti