Importing libraries

In [99]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier



Reading the Dataset

In [100]:
df = pd.read_csv('loan_approval_data.csv')

In [101]:
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,employment_type,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,12th,No,Salaried,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,12th,No,Salaried,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,8th,No,Salaried,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,8th,No,Salaried,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Graduate,No,Salaried,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [102]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   employment_type           4269 non-null   object
 5   income_annum              4269 non-null   int64 
 6   loan_amount               4269 non-null   int64 
 7   loan_term                 4269 non-null   int64 
 8   cibil_score               4269 non-null   int64 
 9   residential_assets_value  4269 non-null   int64 
 10  commercial_assets_value   4269 non-null   int64 
 11  luxury_assets_value       4269 non-null   int64 
 12  bank_asset_value          4269 non-null   int64 
 13  loan_status               4269 non-null   object
dtypes: int64(10), object(4)


In [103]:
df.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


Dropping Columns


In [104]:
df = df.drop(['loan_id','self_employed'],axis=1)

Splitting into independent variables and dependent variable.

In [105]:
X = df.drop('loan_status',axis=1)
y = df['loan_status']


Splitting into training and test sets

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

Preprocessing columns

In [107]:
numerical_cols = X.select_dtypes(include=np.number).columns
ordinal_cols = ['education'] 
nominal_cols = ['employment_type']

education_categories = [['12th', '8th', 'Graduate', '10th']]

Creating Pipelines for preprocssing of each column types

In [108]:
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

nominal_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False)),
])

ordinal_pipeline = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=education_categories)),
])


Column Transformer to apply transformations

In [109]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', nominal_pipeline,nominal_cols),
    ('ord', ordinal_pipeline, ordinal_cols)
])

Fitting and transforming training sets

In [110]:
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_test = preprocessor.transform(X_test)


print(preprocessed_X_train.shape)
print(preprocessed_X_test.shape)

(3415, 12)
(854, 12)


Label Encoder to encode target variable

In [111]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [112]:
print(y_train_encoded.shape)
print(y_test_encoded.shape)

(3415,)
(854,)


Mutual Information Score between each feature and the 'Loan_Status' variable

In [113]:

from sklearn.feature_selection import mutual_info_classif
mi_scores = mutual_info_classif(preprocessed_X_train, y_train_encoded,random_state=42)


feature_names =  preprocessor.get_feature_names_out().tolist()

mi_df = pd.DataFrame({
    'Feature': feature_names,
    'Mutual Information': mi_scores
})


mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)
print(mi_df)

                            Feature  Mutual Information
4                  num__cibil_score            0.501844
3                    num__loan_term            0.012186
10    cat__employment_type_Salaried            0.012166
2                  num__loan_amount            0.009116
5     num__residential_assets_value            0.004877
1                 num__income_annum            0.003874
8             num__bank_asset_value            0.002274
7          num__luxury_assets_value            0.001114
0             num__no_of_dependents            0.000000
6      num__commercial_assets_value            0.000000
9   cat__employment_type_Freelancer            0.000000
11                   ord__education            0.000000


Initializing models

In [114]:
models = {
            "Random Forest": RandomForestClassifier(random_state=42),
            "Logistic Regression": LogisticRegression(random_state=42),
            "KNN": KNeighborsClassifier(),
            "Decision Tree": DecisionTreeClassifier(random_state=42),
            "SVM": SVC(random_state=42),
            "XGBoost": XGBClassifier(random_state=42),
        }

In [115]:
from sklearn.metrics import accuracy_score
for i in range(len(list(models))):
    model = list(models.values())[i]
    
    model.fit(preprocessed_X_train, y_train_encoded)

    y_train_pred = model.predict(preprocessed_X_train)
    y_test_pred = model.predict(preprocessed_X_test)

    train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)

    print(f"Model: {list(models.keys())[i]}",(train_accuracy,test_accuracy))

Model: Random Forest (1.0, 0.9754098360655737)
Model: Logistic Regression (0.9200585651537335, 0.9039812646370023)
Model: KNN (0.9370424597364568, 0.8899297423887588)
Model: Decision Tree (1.0, 0.968384074941452)
Model: SVM (0.9546120058565154, 0.9285714285714286)
Model: XGBoost (1.0, 0.977751756440281)


Initializing HyperParameters

In [116]:
from scipy.stats import randint, uniform
import numpy as np

models_params = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': np.logspace(-3, 3, 7),
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'class_weight': [None, 'balanced']  
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [3,5,7,10,15],
            'min_samples_split': [3,5,10,12],
            'min_samples_leaf': [8,10,12,18,20,16],
            'criterion': ['gini', 'entropy'],
            'class_weight': [None, 'balanced']  
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [25, 50, 100, 150], 
            'max_features': ['sqrt', 'log2', None], 
            'max_depth': [3, 6, 9,12], 
            'min_samples_split': [3,5,10,12],
            'class_weight': [None, 'balanced']  
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3,5,7,9,11,13,15],
            'weights': ['uniform', 'distance'],
            'metric' : ['minkowski','euclidean','manhattan']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': np.logspace(-2, 2, 5),
            'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto'],
            'degree': randint(2, 5)  ,
            'class_weight': [None, 'balanced']  
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(),
        'params': {
            'n_estimators': [25, 50, 100, 150],
            'learning_rate': [0.1, 0.01, 0.001,10],
            'max_depth': [3,5,7,9],
            'subsample': [0.5, 0.7, 1],
            'colsample_bytree': [0.4,0.6,0.8]
        }
    }
}

RandomSearchCV Hyperparameter Tuning

In [117]:
from sklearn.model_selection import RandomizedSearchCV
cv_folds = 5
n = 10
best_results = {}

for model_name, model_info in models_params.items():
    print(f"RandomizedSearchCV for {model_name}:")

    random_search = RandomizedSearchCV(
        estimator=model_info['model'], 
        param_distributions=model_info['params'], 
        n_iter=n,  
        scoring='accuracy',  
        n_jobs=-1,             
        cv=cv_folds,          
        verbose=1,             
        random_state=42      
    )

    random_search.fit(preprocessed_X_train, y_train_encoded)  
    
    best_results[model_name] = {
        'best_score': random_search.best_score_,
        'best_params': random_search.best_params_
    }

    print(f"Best parameters for {model_name}: {random_search.best_params_}")
    print(f"Best cross-validation score for {model_name}: {random_search.best_score_}")
    
    print()



RandomizedSearchCV for LogisticRegression:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for LogisticRegression: {'solver': 'saga', 'penalty': 'l1', 'class_weight': 'balanced', 'C': np.float64(0.01)}
Best cross-validation score for LogisticRegression: 0.9420204978038067

RandomizedSearchCV for DecisionTreeClassifier:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for DecisionTreeClassifier: {'min_samples_split': 10, 'min_samples_leaf': 12, 'max_depth': 10, 'criterion': 'entropy', 'class_weight': 'balanced'}
Best cross-validation score for DecisionTreeClassifier: 0.9768667642752563

RandomizedSearchCV for RandomForestClassifier:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for RandomForestClassifier: {'n_estimators': 150, 'min_samples_split': 5, 'max_features': None, 'max_depth': 12, 'class_weight': 'balanced'}
Best cross-validation score for RandomForestClassifier: 0.9821376281112737

Randomize

In [118]:
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


for model_name, model_info in models_params.items():
    
    random_search = RandomizedSearchCV(
        estimator=model_info['model'], 
        param_distributions=model_info['params'], 
        n_iter=10,  
        scoring='accuracy',  
        n_jobs=-1,  
        cv=cv_folds,  
        verbose=1,  
        random_state=42  
    )
    
    
    random_search.fit(preprocessed_X_train, y_train_encoded)
    
    best_model = random_search.best_estimator_
    

    y_train_pred = best_model.predict(preprocessed_X_train)
    

    y_test_pred = best_model.predict(preprocessed_X_test)

    train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
    test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
    

    print(f"{model_name} - Training Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"{model_name} - Cross-validation Best Score: {random_search.best_score_:.4f}")
    print(f"{model_name} - Test Classification Report:\n {classification_report(y_test_encoded, y_test_pred)}")
    print("="*60)





Fitting 5 folds for each of 10 candidates, totalling 50 fits
LogisticRegression - Training Accuracy: 0.9403, Test Accuracy: 0.9368
LogisticRegression - Cross-validation Best Score: 0.9420
LogisticRegression - Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.91      0.95       536
           1       0.86      0.98      0.92       318

    accuracy                           0.94       854
   macro avg       0.93      0.95      0.93       854
weighted avg       0.94      0.94      0.94       854

Fitting 5 folds for each of 10 candidates, totalling 50 fits
DecisionTreeClassifier - Training Accuracy: 0.9818, Test Accuracy: 0.9754
DecisionTreeClassifier - Cross-validation Best Score: 0.9769
DecisionTreeClassifier - Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       536
           1       0.95      0.98      0.97       318

    accuracy         

In [119]:
rfc = RandomForestClassifier(n_estimators=50, max_depth=12, random_state=42, min_samples_split=3)
rfc.fit(preprocessed_X_train, y_train_encoded)
feature_names = preprocessor.get_feature_names_out().tolist()
feature_importances = rfc.feature_importances_

feature_importance_dict = {name: importance for name, importance in zip(feature_names, feature_importances)}
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)


for feature, importance in sorted_feature_importance:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: num__cibil_score, Importance: 0.8126845929725851
Feature: num__loan_term, Importance: 0.06455472023516833
Feature: num__loan_amount, Importance: 0.026798223144888608
Feature: num__residential_assets_value, Importance: 0.017362833032959015
Feature: num__luxury_assets_value, Importance: 0.01639051104684645
Feature: num__income_annum, Importance: 0.016375823877747445
Feature: num__bank_asset_value, Importance: 0.015050444713407702
Feature: num__commercial_assets_value, Importance: 0.014815017831942957
Feature: num__no_of_dependents, Importance: 0.0073900154536604345
Feature: ord__education, Importance: 0.00551679318221108
Feature: cat__employment_type_Salaried, Importance: 0.00171262990430104
Feature: cat__employment_type_Freelancer, Importance: 0.001348394604281885


In [120]:
from sklearn.ensemble import VotingClassifier

logreg = LogisticRegression(C=0.1, solver='liblinear', penalty='l2', random_state=42,class_weight='balanced')
rfc = RandomForestClassifier(n_estimators=50, max_depth=12, random_state=42,min_samples_split= 3,class_weight='balanced')
xgb = XGBClassifier(n_estimators=50, max_depth=5, learning_rate=0.1, subsample=0.7, colsample_bytree= 0.8,random_state=42)
dtc = DecisionTreeClassifier(max_depth=15, min_samples_split=3, criterion='gini', min_samples_leaf=12,random_state=42,class_weight='balanced',)


voting_clf = VotingClassifier(estimators=[

    ('rfc', rfc),
    ('xgb', xgb),
    ('dtc', dtc),
], voting='hard')


voting_clf.fit(preprocessed_X_train, y_train_encoded)
y_test_pred = voting_clf.predict(preprocessed_X_test)
Y_train_pred = voting_clf.predict(preprocessed_X_train)

print(f'Ensemble Voting Classifier Test Accuracy: {accuracy_score(y_test_encoded, y_test_pred):.4f}')
print(f'Ensemble Voting Classifier Train Accuracy: {accuracy_score(y_train_encoded, y_train_pred):.4f}')
print(f"{model_name} - Test Classification Report:\n {classification_report(y_test_encoded, y_test_pred)}")

Ensemble Voting Classifier Test Accuracy: 0.9766
Ensemble Voting Classifier Train Accuracy: 0.9974
XGBClassifier - Test Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.97      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854



In [121]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[

        ('rfc', rfc),
        ('xgb', xgb),
    ],
    final_estimator=LogisticRegression() 
)

stacking_clf.fit(preprocessed_X_train, y_train_encoded)
y_test_pred = stacking_clf.predict(preprocessed_X_test)
Y_train_pred = stacking_clf.predict(preprocessed_X_train)
print(f'Ensemble Voting Classifier Test Accuracy: {accuracy_score(y_test_encoded, y_test_pred):.4f}')

print(f'Ensemble Voting Classifier Train Accuracy: {accuracy_score(y_train_encoded, y_train_pred):.4f}')

print(f"{model_name} - Test Classification Report:\n {classification_report(y_test_encoded, y_test_pred)}")

Ensemble Voting Classifier Test Accuracy: 0.9766
Ensemble Voting Classifier Train Accuracy: 0.9974
XGBClassifier - Test Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.97      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854



In [127]:
import joblib
rfc = RandomForestClassifier(n_estimators=50, max_depth=12, random_state=42,min_samples_split= 3,class_weight='balanced')
rfc.fit(preprocessed_X_train, y_train_encoded)
joblib.dump(rfc, 'rfc_model.pkl')

['rfc_model.pkl']

In [128]:

def predict_new_data(new_data, preprocessor, model):

    preprocessed_new_data = preprocessor.transform(new_data)

    predictions = model.predict(preprocessed_new_data)
    
    return predictions


new_data = {
    'no_of_dependents': 2,
    'education': 'Graduate',  
    'self_employed': 'Yes',
    'employment_type': 'Salaried',
    'income_annum': 500000,
    'loan_amount': 100000,
    'loan_term': 15,
    'cibil_score': 750,
    'residential_assets_value': 500000,
    'commercial_assets_value': 100000,
    'luxury_assets_value': 200000,
    'bank_asset_value': 300000
}

new_data_df = pd.DataFrame([new_data])

rfc = joblib.load('rfc_model.pkl')
predicted_loan_status = predict_new_data(df, preprocessor, rfc)  

predicted_loan_status = predict_new_data(new_data_df, preprocessor, rfc)  


loan_status_decoded = label_encoder.inverse_transform(predicted_loan_status)

print(f"Predicted Loan Status: {loan_status_decoded[0]}")


Predicted Loan Status: Approved
