# Loan Approval Prediction

This notebook preprocesses loan application data and trains a machine learning model to predict loan approval.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

import joblib
import os


In [3]:
# Load dataset
df = pd.read_csv('loan_data.csv')

# Display first few rows
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [4]:
# Display dataset info
df.info()

# Check for missing values
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [5]:
cat_cols = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
df = pd.get_dummies(df, columns=cat_cols)
df.head()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,person_gender_female,...,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_No,previous_loan_defaults_on_file_Yes
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,True,...,False,True,False,False,False,False,True,False,True,False
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,True,...,True,False,False,True,False,False,False,False,False,True
2,25.0,12438.0,3,5500.0,12.87,0.44,3.0,635,1,True,...,False,False,False,False,False,True,False,False,True,False
3,23.0,79753.0,0,35000.0,15.23,0.44,2.0,675,1,True,...,False,True,False,False,False,True,False,False,True,False
4,24.0,66135.0,1,35000.0,14.27,0.53,4.0,586,1,False,...,False,True,False,False,False,True,False,False,True,False


In [6]:
# Split Data into Training & Testing Sets
X = df.drop(columns=['loan_status'])
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
models = [
    RandomForestClassifier(),
    XGBClassifier(),
    LogisticRegression(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier()
]

# Train, evaluate, and save models
for model in models:
    # Get model name
    model_name = model.__class__.__name__
    
    # Train model
    model.fit(X_train, y_train)
    
    # Save the trained model directly in the same directory
    model_path = f"{model_name}.pkl"
    joblib.dump(model, model_path)
    
    # Evaluate model
    test_score = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    
    # Print results
    print(f"\n🔹 Model: {model_name}")
    print(f"✅ Accuracy Score: {test_score:.4f}")
    print(f"📊 Classification Report:\n{report}")
    print(f"💾 Model saved at: {model_path}")
    print("-" * 60)

print("✅ All models trained and saved in the same directory!")


🔹 Model: RandomForestClassifier
✅ Accuracy Score: 0.9267
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      6990
           1       0.89      0.77      0.82      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.87      0.89      9000
weighted avg       0.93      0.93      0.92      9000

💾 Model saved at: RandomForestClassifier.pkl
------------------------------------------------------------

🔹 Model: XGBClassifier
✅ Accuracy Score: 0.9338
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      6990
           1       0.89      0.81      0.84      2010

    accuracy                           0.93      9000
   macro avg       0.92      0.89      0.90      9000
weighted avg       0.93      0.93      0.93      9000

💾 Model saved at: XGBClassifier.pkl
-----------------------------------------------

I will choose XGBClassifier because it has the best score

In [9]:
df.columns

Index(['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'loan_status', 'person_gender_female',
       'person_gender_male', 'person_education_Associate',
       'person_education_Bachelor', 'person_education_Doctorate',
       'person_education_High School', 'person_education_Master',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'previous_loan_defaults_on_file_No',
       'previous_loan_defaults_on_file_Yes'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   person_age                          45000 non-null  float64
 1   person_income                       45000 non-null  float64
 2   person_emp_exp                      45000 non-null  int64  
 3   loan_amnt                           45000 non-null  float64
 4   loan_int_rate                       45000 non-null  float64
 5   loan_percent_income                 45000 non-null  float64
 6   cb_person_cred_hist_length          45000 non-null  float64
 7   credit_score                        45000 non-null  int64  
 8   loan_status                         45000 non-null  int64  
 9   person_gender_female                45000 non-null  bool   
 10  person_gender_male                  45000 non-null  bool   
 11  person_education_Associate          45000

In [20]:
from flask import Flask, request, jsonify
import numpy as np
import threading

# Load the trained model
model = joblib.load("loan_approval_model.pkl")

# Initialize Flask app
app = Flask(__name__)

@app.route("/")
def home():
    return "Loan Approval Prediction API is running!"

@app.route("/predict", methods=["POST"])
def predict():
    try:
        # Get JSON data from request
        data = request.get_json()

        # Convert input data into a NumPy array
        features = np.array([list(data.values())]).reshape(1, -1)

        # Make prediction
        prediction = model.predict(features)[0]
        probability = model.predict_proba(features)[0][1]  # Probability of approval

        # Return the response
        return jsonify({
            "loan_approval_status": "Approved" if prediction == 1 else "Denied",
            "approval_probability": round(probability, 2)
        })

    except Exception as e:
        return jsonify({"error": str(e)})

# Function to run Flask in a separate thread
def run_api():
    app.run(host="0.0.0.0", port=5000)

# Start the API in a thread so Jupyter Notebook doesn't freeze
thread = threading.Thread(target=run_api)
thread.start()


In [26]:
import requests

url = "http://127.0.0.1:5000/predict"
data = {
    "person_age": 30,
    "person_income": 60000,
    "person_emp_exp": 5,
    "loan_amnt": 10000,
    "loan_int_rate": 5.5,
    "loan_percent_income": 0.15,
    "cb_person_cred_hist_length": 10,
    "credit_score": 720,
    "person_gender": 1,
    "person_education": 1,
    "person_home_ownership": 1,
    "loan_intent_DEBTCONSOLIDATION": 1,
    "previous_loan_defaults_on_file_Yes": 0
}

response = requests.post(url, json=data)
print(response.json())


127.0.0.1 - - [28/Mar/2025 18:37:56] "POST /predict HTTP/1.1" 200 -


{'approval_probability': 0.44, 'loan_approval_status': 'Denied'}


In [28]:
%%writefile app.py

from flask import Flask, request, jsonify
import joblib
import numpy as np

# Load the trained model
model = joblib.load("XGBClassifier.pkl")

# Initialize Flask app
app = Flask(__name__)

@app.route("/")
def home():
    return "Loan Approval Prediction API is running!"

@app.route("/predict", methods=["POST"])
def predict():
    try:
        # Get JSON data from request
        data = request.get_json()

        # Convert input data into a NumPy array
        features = np.array([list(data.values())]).reshape(1, -1)

        # Make prediction
        prediction = model.predict(features)[0]
        probability = model.predict_proba(features)[0][1]  # Probability of approval

        # Return the response
        return jsonify({
            "loan_approval_status": "Approved" if prediction == 1 else "Denied",
            "approval_probability": round(probability, 2)
        })

    except Exception as e:
        return jsonify({"error": str(e)})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)


Writing app.py


In [30]:
%%writefile requirements.txt
flask
numpy
joblib
scikit-learn

Writing requirements.txt


In [3]:
import requests
sample_data = {
    "person_age": 35.0,
    "person_income": 750000.0,
    "person_emp_exp": 10,
    "loan_amnt": 15000.0,
    "loan_int_rate": 7.5,
    "loan_percent_income": 20.0,
    "cb_person_cred_hist_length": 6.0,
    "credit_score": 700,
    "person_gender_female": False,
    "person_gender_male": True,
    "person_education_Associate": False,
    "person_education_Bachelor": True,
    "person_education_Doctorate": False,
    "person_education_High School": False,
    "person_education_Master": False,
    "person_home_ownership_MORTGAGE": True,
    "person_home_ownership_OTHER": False,
    "person_home_ownership_OWN": False,
    "person_home_ownership_RENT": False,
    "loan_intent_DEBTCONSOLIDATION": True,
    "loan_intent_EDUCATION": False,
    "loan_intent_HOMEIMPROVEMENT": False,
    "loan_intent_MEDICAL": False,
    "loan_intent_PERSONAL": False,
    "loan_intent_VENTURE": False,
    "previous_loan_defaults_on_file_No": True,
    "previous_loan_defaults_on_file_Yes": False
}


In [5]:
url = "https://felix-loan-pproval-api.onrender.com/predict"  
response = requests.post(url, json=sample_data)

print("Status Code:", response.status_code)
print("Response:", response.json())  

Status Code: 200
Response: {'approval_probability': 0.02, 'loan_approval_status': 'Denied'}
