**Import Libraries**

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_score, recall_score, f1_score,
    precision_recall_curve, average_precision_score
)

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

**Download and Load Dataset**

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/architsharma01/loan-approval-prediction-dataset?dataset_version_number=1...


100%|██████████| 80.6k/80.6k [00:00<00:00, 366kB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/architsharma01/loan-approval-prediction-dataset/versions/1





In [3]:
file_path = os.path.join(path, "loan_approval_dataset.csv")


df = pd.read_csv(file_path)

print("data shape", df.shape)
print(df.head())

data shape (4269, 13)
   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   
2        3                  3       Graduate             No        9100000   
3        4                  3       Graduate             No        8200000   
4        5                  5   Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0   

In [4]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB
None


In [5]:
print(df.columns.tolist())

['loan_id', ' no_of_dependents', ' education', ' self_employed', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value', ' loan_status']


## Clean Column

In [6]:
df.columns = df.columns.str.strip()

print(df.columns.tolist())


['loan_id', 'no_of_dependents', 'education', 'self_employed', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value', 'loan_status']


In [7]:
print(df["loan_status"].unique())


[' Approved' ' Rejected']


In [8]:
target_col = "loan_status"


df[target_col] = df[target_col].str.strip().str.lower()

print(df[target_col].unique())


df[target_col] = df[target_col].map({
    "approved": 1,
    "rejected": 0
}).astype(int)

y = df[target_col]
X = df.drop(columns=[target_col, "loan_id"])


['approved' 'rejected']


In [9]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(" Numeric columns:", num_cols)
print(" Categorical columns:", cat_cols)


 Numeric columns: ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
 Categorical columns: ['education', 'self_employed']


## Preprocessing Pipelines

In [10]:
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])


preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])


## Train-Test Split


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train size:", X_train.shape, " Test size:", X_test.shape)

Train size: (3415, 11)  Test size: (854, 11)


## Define Models

In [12]:
models = {
    "Logistic Regression": Pipeline([
        ("preproc", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42))
    ]),
    "Decision Tree": Pipeline([
        ("preproc", preprocessor),
        ("clf", DecisionTreeClassifier(class_weight="balanced", random_state=42))
    ]),
    "Logistic + SMOTE": ImbPipeline([
        ("preproc", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("clf", LogisticRegression(max_iter=1000, random_state=42))
    ]),
    "Random Forest": Pipeline([
        ("preproc", preprocessor),
        ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
    ]),
    "XGBoost": Pipeline([
        ("preproc", preprocessor),
        ("clf", XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=42))
    ])
}

## Train Models and Create Comparison Table

In [13]:
# List of models
model_list = [
    ("Logistic Regression", models["Logistic Regression"]),
    ("Decision Tree", models["Decision Tree"]),
    ("Logistic + SMOTE", models["Logistic + SMOTE"]),
    ("Random Forest", models["Random Forest"]),
    ("XGBoost", models["XGBoost"])
]

# DataFrame to store results
results = pd.DataFrame(columns=["Model", "Precision", "Recall", "F1-score"])

for name, model in model_list:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results = pd.concat([results, pd.DataFrame({
        "Model": [name],
        "Precision": [precision],
        "Recall": [recall],
        "F1-score": [f1]
    })], ignore_index=True)

# Sort table by F1-score
results = results.sort_values(by="F1-score", ascending=False).reset_index(drop=True)
print("Model Comparison Table:")
print(results)


  results = pd.concat([results, pd.DataFrame({


Model Comparison Table:
                 Model  Precision    Recall  F1-score
0        Random Forest   0.984962  0.986817  0.985889
1              XGBoost   0.981273  0.986817  0.984038
2        Decision Tree   0.977612  0.986817  0.982193
3  Logistic Regression   0.955078  0.920904  0.937680
4     Logistic + SMOTE   0.951456  0.922787  0.936902


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
