# Random Forest (BAGGING)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


In [2]:
# Load the dataset
data = pd.read_csv('credit_risk_dataset.csv/credit_risk_dataset.csv')

# Handle missing values
data['person_emp_length'].fillna(data['person_emp_length'].median(), inplace=True)
data['loan_int_rate'].fillna(data['loan_int_rate'].median(), inplace=True)

# Encode categorical variables
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
data_encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)


In [3]:
# Define features and target
X = data_encoded.drop(columns=['loan_status'])
y = data['loan_status']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [4]:
# Scale numeric features
numeric_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 
                    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)



In [None]:
# Save the scaler and model as .pkl files

import joblib

joblib.dump(scaler, 'model/scaler.pkl')  # Save the scaler
joblib.dump(rf_model, 'model/random_forest_model.pkl')  # Save the trained model


['model/random_forest_model.pkl']

In [None]:
# Load the scaler and model
scaler = joblib.load('model/scaler.pkl')
rf_model = joblib.load('model/random_forest_model.pkl')


In [13]:
# Predictions and evaluation
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Evaluation Metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5095
           1       0.96      0.72      0.82      1422

    accuracy                           0.93      6517
   macro avg       0.94      0.86      0.89      6517
weighted avg       0.93      0.93      0.93      6517

ROC-AUC Score: 0.9313731092367383
Confusion Matrix:
 [[5047   48]
 [ 396 1026]]


In [21]:
# Sample unseen data (replace with your actual unseen data)
unseen_data = pd.DataFrame([{
    'person_age': 30,
    'person_income': 60000,
    'person_home_ownership': 'RENT',
    'person_emp_length': 5.0,
    'loan_intent': 'PERSONAL',
    'loan_amnt': 10000,
    'loan_int_rate': 12.0,
    'loan_percent_income': 0.15,
    'cb_person_default_on_file': 'N',
    'cb_person_cred_hist_length': 5,
    'loan_grade': 'B'
}])

# Handle missing values in unseen data
unseen_data['person_emp_length'].fillna(unseen_data['person_emp_length'].median(), inplace=True)
unseen_data['loan_int_rate'].fillna(unseen_data['loan_int_rate'].median(), inplace=True)

# Encode categorical variables
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
unseen_data_encoded = pd.get_dummies(unseen_data, columns=categorical_features, drop_first=True)

# Ensure all columns from training are present
missing_cols = set(X_train.columns) - set(unseen_data_encoded.columns)
for col in missing_cols:
    unseen_data_encoded[col] = 0

# Ensure columns are in the same order as during training
unseen_data_encoded = unseen_data_encoded[X_train.columns]

# Scale numeric features
numeric_features = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 
                   'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
unseen_data_encoded[numeric_features] = scaler.transform(unseen_data_encoded[numeric_features])

# Make prediction
prediction = rf_model.predict(unseen_data_encoded)
prediction_prob = rf_model.predict_proba(unseen_data_encoded)

# Print results
print("Loan Status Prediction:", "Approved" if prediction[0] == 1 else "Denied")
print("Probability of Approval:", f"{prediction_prob[0][1]:.2%}")
print("Probability of Denial:", f"{prediction_prob[0][0]:.2%}")


Loan Status Prediction: Denied
Probability of Approval: 16.00%
Probability of Denial: 84.00%
