In [2]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
     -------------------------------------- 124.9/124.9 MB 9.0 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.4


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef, confusion_matrix
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import joblib
import os

In [2]:
import kagglehub

# Kaggle handle of the dataset, in "<owner>/<dataset-slug>" form.
DATASET_HANDLE = "parthpatel2130/realistic-loan-approval-dataset-us-and-canada"

# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print(f"Dataset downloaded to: {path}")

Dataset downloaded to: C:\Users\Shreyash\.cache\kagglehub\datasets\parthpatel2130\realistic-loan-approval-dataset-us-and-canada\versions\1


In [3]:

# Locate the dataset CSV inside the downloaded directory. os.listdir returns
# entries in arbitrary order, so sort to make the choice deterministic.
csv_candidates = sorted(f for f in os.listdir(path) if f.endswith(".csv"))
if not csv_candidates:
    # Fail with a clear message instead of a TypeError from concatenating None.
    raise FileNotFoundError(f"No .csv file found in dataset directory: {path}")
csv_file = csv_candidates[0]

df = pd.read_csv(os.path.join(path, csv_file))
print("Dataset loaded:", csv_file)


# Descriptive metadata about the dataset, printed below.
dataset_name = "Realistic Loan Approval Dataset (US & Canada)"
dataset_source = "Kaggle"
n_samples = df.shape[0]
# Every column except one (the target) is counted as a feature.
# NOTE(review): this count still includes `customer_id`, which a later cell
# drops as "not a feature" - confirm whether it should be excluded here too.
n_features = df.shape[1] - 1
problem_type = "binary_classification"

print(f"Dataset: {dataset_name}")
print(f"Source: {dataset_source}")
print(f"Samples: {n_samples}, Features: {n_features}")
print(f"Problem Type: {problem_type}")


Dataset loaded: Loan_approval_data_2025.csv
Dataset: Realistic Loan Approval Dataset (US & Canada)
Source: Kaggle
Samples: 50000, Features: 19
Problem Type: binary_classification


In [4]:
# Report (rows, columns), then preview the first five records.
print(f"Dataset shape: {df.shape}")
df.head(5)

Dataset shape: (50000, 20)


Unnamed: 0,customer_id,age,occupation_status,years_employed,annual_income,credit_score,credit_history_years,savings_assets,current_debt,defaults_on_file,delinquencies_last_2yrs,derogatory_marks,product_type,loan_intent,loan_amount,interest_rate,debt_to_income_ratio,loan_to_income_ratio,payment_to_income_ratio,loan_status
0,CUST100000,40,Employed,17.2,25579,692,5.3,895,10820,0,0,0,Credit Card,Business,600,17.02,0.423,0.023,0.008,1
1,CUST100001,33,Employed,7.3,43087,627,3.5,169,16550,0,1,0,Personal Loan,Home Improvement,53300,14.1,0.384,1.237,0.412,0
2,CUST100002,42,Student,1.1,20840,689,8.4,17,7852,0,0,0,Credit Card,Debt Consolidation,2100,18.33,0.377,0.101,0.034,1
3,CUST100003,53,Student,0.5,29147,692,9.8,1480,11603,0,1,0,Credit Card,Business,2900,18.74,0.398,0.099,0.033,1
4,CUST100004,32,Employed,12.5,63657,630,7.2,209,12424,0,0,0,Personal Loan,Education,99600,13.92,0.195,1.565,0.522,1


In [5]:
# Column dtypes and non-null counts, followed by the missing-value count per column.
df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id              50000 non-null  object 
 1   age                      50000 non-null  int64  
 2   occupation_status        50000 non-null  object 
 3   years_employed           50000 non-null  float64
 4   annual_income            50000 non-null  int64  
 5   credit_score             50000 non-null  int64  
 6   credit_history_years     50000 non-null  float64
 7   savings_assets           50000 non-null  int64  
 8   current_debt             50000 non-null  int64  
 9   defaults_on_file         50000 non-null  int64  
 10  delinquencies_last_2yrs  50000 non-null  int64  
 11  derogatory_marks         50000 non-null  int64  
 12  product_type             50000 non-null  object 
 13  loan_intent              50000 non-null  object 
 14  loan_amount           

customer_id                0
age                        0
occupation_status          0
years_employed             0
annual_income              0
credit_score               0
credit_history_years       0
savings_assets             0
current_debt               0
defaults_on_file           0
delinquencies_last_2yrs    0
derogatory_marks           0
product_type               0
loan_intent                0
loan_amount                0
interest_rate              0
debt_to_income_ratio       0
loan_to_income_ratio       0
payment_to_income_ratio    0
loan_status                0
dtype: int64

In [6]:

# Class balance of the target column.
target_counts = df['loan_status'].value_counts()
target_counts


1    27523
0    22477
Name: loan_status, dtype: int64

In [7]:
# String-valued columns to expand into indicator (dummy) columns.
categorical_cols = ['occupation_status', 'product_type', 'loan_intent']

# drop_first=True omits one level per column, so k categories become k-1 columns.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
print(f"New shape after encoding: {df.shape}")


New shape after encoding: (50000, 26)


In [10]:
# Drop ID column (not a feature).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['customer_id'], errors='ignore')


In [11]:
# Target vector and predictor matrix.
y = df['loan_status']
X = df.drop(columns='loan_status')

# 80/20 hold-out split; stratifying on y keeps the class ratio the same in
# both parts, and the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

print(X_train.shape, X_test.shape)


(40000, 24) (10000, 24)


In [12]:
# Learn the standardization statistics from the training split only, then
# apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler in the output directory (created if missing).
os.makedirs("model", exist_ok=True)
joblib.dump(scaler, "model/scaler.pkl")


['model/scaler.pkl']

In [13]:
# Candidate classifiers keyed by display name. The name is also used to build
# the file name under which each fitted model is saved.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42
    ),
    # `use_label_encoder` was removed here: the installed xgboost 2.x does not
    # use it and emitted 'Parameters: { "use_label_encoder" } are not used.'
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        random_state=42
    )
}


In [14]:
results = []

def evaluate_model(name, model, X_tr, X_te, y_tr=None, y_te=None):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    name : str
        Label stored under the "Model" key of the returned dict.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``.
    X_tr, X_te : array-like
        Training and test feature matrices.
    y_tr, y_te : array-like, optional
        Training and test labels. When omitted, the notebook-level
        ``y_train`` / ``y_test`` are used, so existing four-argument calls
        behave exactly as before.

    Returns
    -------
    dict
        Keys "Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score"
        and "MCC".
    """
    # Fall back to the notebook globals only when labels are not passed in.
    if y_tr is None:
        y_tr = y_train
    if y_te is None:
        y_te = y_test

    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_te)
    # Column 1 of predict_proba is the probability of the positive class,
    # which is the score AUC is computed from.
    y_prob = model.predict_proba(X_te)[:, 1]

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_te, y_pred),
        "AUC": roc_auc_score(y_te, y_prob),
        "Precision": precision_score(y_te, y_pred),
        "Recall": recall_score(y_te, y_pred),
        "F1 Score": f1_score(y_te, y_pred),
        "MCC": matthews_corrcoef(y_te, y_pred)
    }

    return metrics


In [15]:
# Models that are trained and evaluated on the standardized features.
SCALED_MODELS = {"Logistic Regression", "KNN"}

# Start from an empty list so re-running this cell does not append a second
# copy of every row to `results`.
results = []

# Make sure the output directory exists even if the scaler cell was skipped.
os.makedirs("model", exist_ok=True)

for name, model in models.items():

    # Scale-sensitive models get the scaled matrices; the rest get raw features.
    if name in SCALED_MODELS:
        metrics = evaluate_model(name, model, X_train_scaled, X_test_scaled)
    else:
        metrics = evaluate_model(name, model, X_train, X_test)

    results.append(metrics)

    # Save the fitted model, e.g. "Random Forest" -> model/random_forest.pkl
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, f"model/{filename}")


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
Parameters: { "use_label_encoder" } are not used.



In [16]:
# One row per model, one column per metric.
results_df = pd.DataFrame(data=results)
results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.8653,0.944563,0.871117,0.886467,0.878725,0.727424
1,Decision Tree,0.8715,0.869594,0.87936,0.888465,0.883889,0.740098
2,KNN,0.8673,0.932018,0.854188,0.915168,0.883627,0.732046
3,Naive Bayes,0.7436,0.827974,0.799186,0.713533,0.753935,0.491501
4,Random Forest,0.9099,0.972544,0.915224,0.921708,0.918454,0.817823
5,XGBoost,0.9243,0.982572,0.923475,0.940418,0.931869,0.846901


In [17]:
# Reload the saved XGBoost model from disk and predict on the unscaled test
# features, which is what this model was trained on.
# NOTE: joblib.load unpickles the file, so only load files you produced yourself.
best_model = joblib.load("model/xgboost.pkl")
y_pred = best_model.predict(X_test)

# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[4066,  429],
       [ 328, 5177]], dtype=int64)

In [18]:
# Write the comparison table next to the saved models, without the row index.
results_df.to_csv(path_or_buf="model/model_comparison_results.csv", index=False)
